| { | |
| "title": "Reinforcement Learning Policy Mastery: 100 MCQs", | |
| "description": "A comprehensive set of 100 multiple-choice questions covering RL Policies, including actions, policy types (deterministic, stochastic, greedy, ε-greedy), optimization, exploration vs. exploitation, and real-world scenarios.", | |
| "questions": [ | |
| { | |
| "id": 1, | |
| "questionText": "In reinforcement learning, what is an action?", | |
| "options": [ | |
| "A fixed rule of the environment", | |
| "A numerical reward signal", | |
| "A choice made by the agent to interact with the environment", | |
| "Information received from the environment" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "An action is the decision taken by the agent to affect the environment." | |
| }, | |
| { | |
| "id": 2, | |
| "questionText": "What is a policy in reinforcement learning?", | |
| "options": [ | |
| "A mapping from states to actions", | |
| "A sequence of states", | |
| "The transition probability", | |
| "The total reward received" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A policy is a strategy that decides which action to take in each state." | |
| }, | |
| { | |
| "id": 3, | |
| "questionText": "A policy is denoted mathematically as:", | |
| "options": [ | |
| "γ", | |
| "R(s)", | |
| "π(a|s)", | |
| "P(s’|s,a)" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "π(a|s) defines the probability of taking action a when in state s." | |
| }, | |
| { | |
| "id": 4, | |
| "questionText": "A deterministic policy always:", | |
| "options": [ | |
| "Selects the same action for a given state", | |
| "Depends only on reward", | |
| "Selects random actions", | |
| "Changes over time" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A deterministic policy selects the same action every time for a specific state." | |
| }, | |
| { | |
| "id": 5, | |
| "questionText": "A stochastic policy:", | |
| "options": [ | |
| "Does not involve randomness", | |
| "Ignores the current state", | |
| "Gives a probability distribution over actions", | |
| "Always selects the same action" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "A stochastic policy gives probabilities for each possible action." | |
| }, | |
| { | |
| "id": 6, | |
| "questionText": "Which of the following represents an action set?", | |
| "options": [ | |
| "History of transitions", | |
| "All possible states in the environment", | |
| "All rewards received", | |
| "All possible decisions the agent can make at a state" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The action set is the list of all actions the agent can choose." | |
| }, | |
| { | |
| "id": 7, | |
| "questionText": "A policy helps the agent to:", | |
| "options": [ | |
| "Modify the environment", | |
| "Decide which action to take", | |
| "Terminate learning immediately", | |
| "Change the reward function" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "A policy directly guides the agent’s decisions at each state." | |
| }, | |
| { | |
| "id": 8, | |
| "questionText": "An optimal policy aims to:", | |
| "options": [ | |
| "Only get immediate rewards", | |
| "Avoid terminal states", | |
| "Maximize long-term cumulative reward", | |
| "Minimize actions taken" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Optimal policies aim to maximize cumulative return, not just immediate rewards." | |
| }, | |
| { | |
| "id": 9, | |
| "questionText": "Which symbol is commonly used for policy?", | |
| "options": [ | |
| "λ (lambda)", | |
| "δ (delta)", | |
| "π (pi)", | |
| "μ (mu)" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Policy is conventionally represented as π in reinforcement learning literature." | |
| }, | |
| { | |
| "id": 10, | |
| "questionText": "In reinforcement learning, actions are chosen based on:", | |
| "options": [ | |
| "Fixed environment rules", | |
| "The current policy", | |
| "Random guessing", | |
| "The reward only" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "The agent follows its policy to choose an action in each state." | |
| }, | |
| { | |
| "id": 11, | |
| "questionText": "A policy π(a|s) defines:", | |
| "options": [ | |
| "The reward received after an action", | |
| "The probability of taking action a in state s", | |
| "The discount factor", | |
| "The next state transition" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "π(a|s) gives the probability that the agent selects action a when it is in state s." | |
| }, | |
| { | |
| "id": 12, | |
| "questionText": "Which type of policy is commonly used during exploration?", | |
| "options": [ | |
| "Deterministic policy", | |
| "Static policy", | |
| "Stochastic policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Stochastic policies allow randomness and help the agent to explore better." | |
| }, | |
| { | |
| "id": 13, | |
| "questionText": "What does a greedy policy do?", | |
| "options": [ | |
| "Always selects the action with the highest estimated value", | |
| "Selects random actions", | |
| "Avoids high rewards", | |
| "Maximizes long-term regret" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A greedy policy always selects the action currently believed to give the highest reward." | |
| }, | |
| { | |
| "id": 14, | |
| "questionText": "The ε-greedy policy:", | |
| "options": [ | |
| "Always exploits the best action", | |
| "Balances exploration and exploitation", | |
| "Never selects random actions", | |
| "Explores only" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "ε-greedy policy chooses the best action most of the time, but sometimes explores randomly." | |
| }, | |
| { | |
| "id": 15, | |
| "questionText": "A policy directly controls:", | |
| "options": [ | |
| "How rewards are given", | |
| "The discount factor", | |
| "Which action the agent selects", | |
| "The transition probability" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The policy is responsible for selecting which action to take in each state." | |
| }, | |
| { | |
| "id": 16, | |
| "questionText": "Which of the following is true for a deterministic policy?", | |
| "options": [ | |
| "It randomly selects an action", | |
| "It maps each state to exactly one action", | |
| "It gives probabilities for actions", | |
| "It has no control over actions" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Deterministic policy chooses exactly one action for a given state." | |
| }, | |
| { | |
| "id": 17, | |
| "questionText": "In reinforcement learning, the agent follows a policy to:", | |
| "options": [ | |
| "Generate rewards", | |
| "Stop learning", | |
| "Define terminal states", | |
| "Decide actions" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The policy guides the agent in selecting actions during interaction." | |
| }, | |
| { | |
| "id": 18, | |
| "questionText": "A policy that selects actions purely based on current reward and ignores future rewards is:", | |
| "options": [ | |
| "Myopic", | |
| "Long-term", | |
| "Optimal", | |
| "Model-based" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A myopic policy only focuses on immediate rewards and not the future." | |
| }, | |
| { | |
| "id": 19, | |
| "questionText": "Which of the following best describes the role of a policy?", | |
| "options": [ | |
| "It stores previous rewards", | |
| "It defines the agent’s behavior strategy", | |
| "It calculates final reward only", | |
| "It controls environment dynamics" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "A policy is the strategy that the agent uses to decide what action to take." | |
| }, | |
| { | |
| "id": 20, | |
| "questionText": "What is the goal of policy optimization?", | |
| "options": [ | |
| "To eliminate all randomness", | |
| "To improve the policy for higher long-term rewards", | |
| "To reduce the number of states", | |
| "To modify the reward function" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Policy optimization aims to improve decision-making for maximum cumulative reward." | |
| }, | |
| { | |
| "id": 21, | |
| "questionText": "Which type of policy is preferred during exploitation?", | |
| "options": [ | |
| "Exploratory", | |
| "Myopic", | |
| "Fully random", | |
| "Deterministic" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Exploitation prefers deterministic behavior for consistent maximum return." | |
| }, | |
| { | |
| "id": 22, | |
| "questionText": "What does ε represent in ε-greedy policy?", | |
| "options": [ | |
| "Learning rate", | |
| "Probability of exploration", | |
| "State transition cost", | |
| "Discount factor" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "ε controls how often the agent explores randomly instead of exploiting." | |
| }, | |
| { | |
| "id": 23, | |
| "questionText": "A policy that changes and improves over time is called:", | |
| "options": [ | |
| "Fixed policy", | |
| "Adaptive policy", | |
| "Terminal policy", | |
| "Static policy" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Adaptive policies evolve with learning and improve performance over time." | |
| }, | |
| { | |
| "id": 24, | |
| "questionText": "Which algorithm often improves policy iteratively?", | |
| "options": [ | |
| "Clustering", | |
| "Bubble sort", | |
| "Policy gradient", | |
| "Binary search" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Policy gradient is a reinforcement learning method specifically designed to optimize policies." | |
| }, | |
| { | |
| "id": 25, | |
| "questionText": "The policy essentially forms the agent’s:", | |
| "options": [ | |
| "Memory buffer", | |
| "Reward system", | |
| "Behavior strategy", | |
| "Environment model" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The policy is the behavior strategy used to choose actions." | |
| }, | |
| { | |
| "id": 26, | |
| "questionText": "Which policy guarantees the same output for each particular input state?", | |
| "options": [ | |
| "Random policy", | |
| "Stochastic policy", | |
| "Deterministic policy", | |
| "Exploration-only policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Deterministic policies always map one state to one fixed action." | |
| }, | |
| { | |
| "id": 27, | |
| "questionText": "What is the output of a policy?", | |
| "options": [ | |
| "A future state", | |
| "A value estimate", | |
| "A reward signal", | |
| "An action selection" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The policy outputs an action based on the current state." | |
| }, | |
| { | |
| "id": 28, | |
| "questionText": "Which of the following is true?", | |
| "options": [ | |
| "Discount factor selects actions", | |
| "Environment follows the agent’s rules", | |
| "Policy decides the action; environment gives the outcome", | |
| "Policy gives reward directly" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Policy → action. Environment → next state and reward." | |
| }, | |
| { | |
| "id": 29, | |
| "questionText": "Which kind of policy is more flexible and good for exploration?", | |
| "options": [ | |
| "Static policy", | |
| "Deterministic policy", | |
| "History-free policy", | |
| "Stochastic policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Stochastic policies are better for discovering new strategies via randomness." | |
| }, | |
| { | |
| "id": 30, | |
| "questionText": "A policy that does not change over time is called:", | |
| "options": [ | |
| "Optimal policy", | |
| "Static policy", | |
| "Gradient policy", | |
| "Adaptive policy" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "A static policy remains fixed throughout the learning process." | |
| }, | |
| { | |
| "id": 31, | |
| "questionText": "Which policy type is best suited for environments with high uncertainty or partially observable states?", | |
| "options": [ | |
| "Stochastic policy", | |
| "Deterministic policy", | |
| "Static policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Stochastic policies allow exploration and handle uncertainty better by assigning probabilities to actions." | |
| }, | |
| { | |
| "id": 32, | |
| "questionText": "In reinforcement learning, what does the policy π directly depend on?", | |
| "options": [ | |
| "Past rewards only", | |
| "Current state", | |
| "Network latency", | |
| "Future rewards only" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Policy π(a|s) maps the current state s to an action or probability distribution over actions." | |
| }, | |
| { | |
| "id": 33, | |
| "questionText": "A greedy policy always:", | |
| "options": [ | |
| "Avoids previously selected actions", | |
| "Explores equally among all actions", | |
| "Chooses the action with highest estimated value", | |
| "Chooses randomly" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Greedy policy exploits knowledge and always selects the best-known action." | |
| }, | |
| { | |
| "id": 34, | |
| "questionText": "What happens if a policy focuses only on short-term rewards?", | |
| "options": [ | |
| "It becomes myopic and may miss long-term gains", | |
| "It guarantees optimal performance", | |
| "It never learns from mistakes", | |
| "It requires no environment feedback" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A short-sighted policy (myopic) becomes greedy and may miss high future rewards." | |
| }, | |
| { | |
| "id": 35, | |
| "questionText": "Which policy ensures a fixed mapping from state to action but does NOT involve randomness?", | |
| "options": [ | |
| "Stochastic policy", | |
| "Deterministic policy", | |
| "Exploratory policy", | |
| "Random policy" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Deterministic policy gives the same action every time for the same state." | |
| }, | |
| { | |
| "id": 36, | |
| "questionText": "An ε-greedy policy will choose a random action with probability:", | |
| "options": [ | |
| "1 − ε", | |
| "ε", | |
| "0", | |
| "Always" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "ε controls exploration; with probability ε, a random action is selected." | |
| }, | |
| { | |
| "id": 37, | |
| "questionText": "What is the purpose of policy improvement?", | |
| "options": [ | |
| "To reduce the number of actions", | |
| "To make the policy produce better long-term rewards", | |
| "To disable exploration", | |
| "To eliminate randomness" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Policy improvement updates the agent’s strategy toward higher future returns." | |
| }, | |
| { | |
| "id": 38, | |
| "questionText": "Which type of policy selects an action with a probability proportional to its value?", | |
| "options": [ | |
| "Static policy", | |
| "Deterministic policy", | |
| "Greedy policy", | |
| "Softmax policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Softmax policies assign action probabilities based on exponentiated value estimates." | |
| }, | |
| { | |
| "id": 39, | |
| "questionText": "In reinforcement learning, a policy defines:", | |
| "options": [ | |
| "The environment transitions", | |
| "The discount factor", | |
| "The reward assignment logic", | |
| "The rule for choosing actions" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "A policy completely governs how an agent selects actions in each state." | |
| }, | |
| { | |
| "id": 40, | |
| "questionText": "If a policy never explores new actions, it may:", | |
| "options": [ | |
| "Increase randomness over time", | |
| "Get stuck in a suboptimal behavior", | |
| "Automatically find the best policy", | |
| "Reset the environment state" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Without exploration, the agent may get stuck in local optimum solutions." | |
| }, | |
| { | |
| "id": 41, | |
| "questionText": "Which policy approach balances both exploitation and exploration?", | |
| "options": [ | |
| "Greedy policy", | |
| "Deterministic policy", | |
| "Static policy", | |
| "ε-greedy policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "ε-greedy selects the best-known action most of the time while still exploring occasionally." | |
| }, | |
| { | |
| "id": 42, | |
| "questionText": "Which policy is purely exploitation-based?", | |
| "options": [ | |
| "ε-greedy policy", | |
| "Softmax policy", | |
| "Stochastic policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Greedy policy always selects the current best action and does not explore." | |
| }, | |
| { | |
| "id": 43, | |
| "questionText": "A policy that does not change over time is called:", | |
| "options": [ | |
| "Adaptive policy", | |
| "Exploratory policy", | |
| "Parameterized policy", | |
| "Fixed or static policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Static policies remain unchanged during training." | |
| }, | |
| { | |
| "id": 44, | |
| "questionText": "Which type of policy is optimized directly in Policy Gradient methods?", | |
| "options": [ | |
| "Static policy", | |
| "Stochastic policy", | |
| "Rule-based policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Policy gradient algorithms operate directly on stochastic parameterized policies." | |
| }, | |
| { | |
| "id": 45, | |
| "questionText": "What is the core objective of optimal policy?", | |
| "options": [ | |
| "To reduce number of actions", | |
| "To minimize time spent in each state", | |
| "To maximize cumulative long-term reward", | |
| "To avoid terminal states" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The goal of optimal policy is to maximize expected discounted return over time." | |
| }, | |
| { | |
| "id": 46, | |
| "questionText": "Which policy type outputs a probability distribution over actions?", | |
| "options": [ | |
| "Static policy", | |
| "Deterministic policy", | |
| "Stochastic policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Stochastic policies output action probabilities rather than a fixed action." | |
| }, | |
| { | |
| "id": 47, | |
| "questionText": "Which situation requires stochastic policies over deterministic ones?", | |
| "options": [ | |
| "When multiple actions have equal rewards", | |
| "Reward is guaranteed every step", | |
| "No randomness is allowed", | |
| "Agent already knows full environment" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Stochastic policies prevent bias when actions are equally good." | |
| }, | |
| { | |
| "id": 48, | |
| "questionText": "Which policy ensures full exploration but no exploitation?", | |
| "options": [ | |
| "Deterministic policy", | |
| "Greedy policy", | |
| "Softmax policy", | |
| "Random policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Random policy entirely explores without considering reward values." | |
| }, | |
| { | |
| "id": 49, | |
| "questionText": "The policy improvement step checks:", | |
| "options": [ | |
| "Only immediate rewards", | |
| "If taking a better action increases long-term reward", | |
| "If randomness is eliminated", | |
| "The size of the replay buffer" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Policy improvement ensures updated actions lead to better expected return." | |
| }, | |
| { | |
| "id": 50, | |
| "questionText": "What happens if ε in ε-greedy policy is too high?", | |
| "options": [ | |
| "All rewards become zero", | |
| "Policy becomes deterministic", | |
| "Agent explores too much and learns slowly", | |
| "Agent gets stuck in exploitation" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Too high ε causes too much random exploration and slower learning." | |
| }, | |
| { | |
| "id": 51, | |
| "questionText": "What is the main disadvantage of a purely greedy policy?", | |
| "options": [ | |
| "It explores too much", | |
| "It may get stuck in a local optimum", | |
| "It ignores rewards completely", | |
| "It requires deep models only" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Greedy policy exploits only current best action and may miss higher long-term rewards." | |
| }, | |
| { | |
| "id": 52, | |
| "questionText": "Which policy is essential for exploration-exploitation balance in RL?", | |
| "options": [ | |
| "ε-greedy policy", | |
| "Greedy policy", | |
| "Random policy", | |
| "Static policy" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "ε-greedy smartly balances exploitation with occasional exploration." | |
| }, | |
| { | |
| "id": 53, | |
| "questionText": "Why are stochastic policies preferred over deterministic ones in partially observable environments?", | |
| "options": [ | |
| "They prevent overfitting to rewards.", | |
| "They improve exploration and robustness to uncertainty.", | |
| "They guarantee immediate reward.", | |
| "They reduce computation time." | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings." | |
| }, | |
| { | |
| "id": 54, | |
| "questionText": "Policy π(a|s) in RL typically represents:", | |
| "options": [ | |
| "Transition from state s to s'", | |
| "Value of the state", | |
| "Sum of discounted rewards", | |
| "Probability of selecting action a at state s" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "π(a|s) expresses how likely the agent is to take action a in state s." | |
| }, | |
| { | |
| "id": 55, | |
| "questionText": "What happens if ε is set to 0 in ε-greedy policy?", | |
| "options": [ | |
| "It stops learning", | |
| "It becomes fully random", | |
| "The policy becomes fully greedy", | |
| "It becomes softmax-based" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "ε = 0 means no exploration — pure greedy exploitation." | |
| }, | |
| { | |
| "id": 56, | |
| "questionText": "Softmax policy chooses actions based on:", | |
| "options": [ | |
| "Normalized exponentiated action values", | |
| "Fixed priority order", | |
| "Pure randomness", | |
| "Greedy selection only" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Softmax uses exponentiated Q-values and applies probability distribution." | |
| }, | |
| { | |
| "id": 57, | |
| "questionText": "A well-optimized policy should be:", | |
| "options": [ | |
| "Stable and yield maximum cumulative reward", | |
| "Constantly changing even after convergence", | |
| "Ignoring future rewards", | |
| "Completely random" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "An ideal policy should maximize cumulative return with stable decision-making." | |
| }, | |
| { | |
| "id": 58, | |
| "questionText": "Which policy is more suitable for early training?", | |
| "options": [ | |
| "Deterministic policy", | |
| "Greedy policy", | |
| "Static policy", | |
| "Stochastic or ε-greedy policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Stochastic or ε-greedy policies help explore environment during learning phase." | |
| }, | |
| { | |
| "id": 59, | |
| "questionText": "A deterministic policy π(s) returns:", | |
| "options": [ | |
| "A random action every time", | |
| "A probability distribution", | |
| "A single fixed action for state s", | |
| "The immediate reward" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Deterministic policy gives one specific action for each state." | |
| }, | |
| { | |
| "id": 60, | |
| "questionText": "What problem does the ε-greedy policy try to solve?", | |
| "options": [ | |
| "Reward scaling issue", | |
| "Learning rate adjustment", | |
| "Exploration vs exploitation dilemma", | |
| "Model overfitting" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "ε-greedy gives balance between trying new actions and exploiting known good ones." | |
| }, | |
| { | |
| "id": 61, | |
| "questionText": "A policy that adapts and improves as learning progresses is called:", | |
| "options": [ | |
| "Random policy", | |
| "Static policy", | |
| "Adaptive policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Adaptive policy updates itself over time for better performance." | |
| }, | |
| { | |
| "id": 62, | |
| "questionText": "If ε is too small in ε-greedy policy:", | |
| "options": [ | |
| "It stops acting on states", | |
| "Policy becomes unstable", | |
| "Agent keeps taking random actions", | |
| "The agent may not explore enough" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Too small ε leads to very less exploration — causing local optimum trap." | |
| }, | |
| { | |
| "id": 63, | |
| "questionText": "Which policy guarantees the best long-term reward theoretically?", | |
| "options": [ | |
| "Static policy", | |
| "Random policy", | |
| "Greedy policy", | |
| "Optimal policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The optimal policy is the mathematically best-performing policy." | |
| }, | |
| { | |
| "id": 64, | |
| "questionText": "Policy π is said to converge when:", | |
| "options": [ | |
| "Environment resets", | |
| "Rewards become zero", | |
| "Further updates do not improve its long-term return", | |
| "It stops selecting actions" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Convergence means policy has reached optimality and no longer changes significantly." | |
| }, | |
| { | |
| "id": 65, | |
| "questionText": "Which policy is best when actions are continuous, like turning a steering wheel?", | |
| "options": [ | |
| "Static policy", | |
| "Greedy-only policy", | |
| "Stochastic policy", | |
| "Greedy policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Stochastic policies can represent probability distributions over continuous action values." | |
| }, | |
| { | |
| "id": 66, | |
| "questionText": "Which policy type is commonly used in deep policy gradient methods?", | |
| "options": [ | |
| "Parameterized stochastic policy", | |
| "Static deterministic policy", | |
| "Greedy-only policy", | |
| "Pure random policy" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Policy gradient RL learns continuous parameterized stochastic policies." | |
| }, | |
| { | |
| "id": 67, | |
| "questionText": "What is the goal of an optimal policy π*?", | |
| "options": [ | |
| "To maximize the expected cumulative future reward", | |
| "To ensure equal action probability", | |
| "To ignore future outcomes", | |
| "To minimize state visitation" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Optimal policy focuses only on maximizing long-term discounted returns." | |
| }, | |
| { | |
| "id": 68, | |
| "questionText": "Which policy ensures highest theoretical performance but may be hard to compute?", | |
| "options": [ | |
| "Optimal policy π*", | |
| "Static policy", | |
| "Fixed greedy policy", | |
| "Random policy" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Optimal policy π* is mathematically best but hard to derive in complex environments." | |
| }, | |
| { | |
| "id": 69, | |
| "questionText": "When does ε decay in ε-greedy policy?", | |
| "options": [ | |
| "At the start of training only", | |
| "As training progresses to reduce randomness", | |
| "Only when reward is negative", | |
| "When the environment resets" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Exploration reduces over time by decaying ε — leading to more exploitation later." | |
| }, | |
| { | |
| "id": 70, | |
| "questionText": "Which policy is most suitable for a fully trained agent ready for deployment?", | |
| "options": [ | |
| "Random policy", | |
| "High ε policy", | |
| "Deterministic or greedy policy", | |
| "Static equal probability policy" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Once fully trained, deterministic greedy policies are ideal for stable deployment." | |
| }, | |
| { | |
| "id": 71, | |
| "questionText": "In a self-driving car RL system, a 'policy' is best described as:", | |
| "options": [ | |
| "A learned probability distribution over actions given states.", | |
| "A list of manually coded safety instructions.", | |
| "A fixed rule for mapping visual input to steering angle.", | |
| "A memory of all past successful trajectories." | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A policy defines the agent’s behavior as a probability distribution of choosing actions conditioned on the current state." | |
| }, | |
| { | |
| "id": 72, | |
| "questionText": "Why are stochastic policies often preferred over deterministic ones in partially observable environments?", | |
| "options": [ | |
| "They guarantee immediate reward.", | |
| "They prevent overfitting to rewards.", | |
| "They reduce computation time.", | |
| "They improve exploration and robustness to uncertainty." | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings." | |
| }, | |
| { | |
| "id": 73, | |
| "questionText": "A policy π(a|s) outputs 0.9 for action A and 0.1 for action B. What does this imply?", | |
| "options": [ | |
| "Action A is always selected.", | |
| "Both actions have equal probability.", | |
| "Action B is always selected.", | |
| "Action A has higher likelihood of being chosen." | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The policy indicates the relative preference for taking actions based on their probabilities." | |
| }, | |
| { | |
| "id": 74, | |
| "questionText": "In policy-based RL algorithms, what is optimized directly?", | |
| "options": [ | |
| "The policy parameters.", | |
| "The value function.", | |
| "The reward function.", | |
| "The transition probability of environment states." | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Policy-based methods optimize the policy parameters directly to maximize expected reward." | |
| }, | |
| { | |
| "id": 75, | |
| "questionText": "Which statement is TRUE about greedy policies?", | |
| "options": [ | |
| "They always pick the action with highest estimated value.", | |
| "They ignore Q-values completely.", | |
| "They randomly choose between top two actions.", | |
| "They always balance exploration and exploitation." | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "A greedy policy always chooses the action with maximum current estimated value." | |
| }, | |
| { | |
| "id": 76, | |
| "questionText": "A trading bot based on RL keeps choosing the same profitable action and misses future better opportunities. Which failure is happening?", | |
| "options": [ | |
| "Exploration overflow", | |
| "Reward hacking", | |
| "State corruption", | |
| "Policy collapse due to over-exploitation" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Over-exploitation leads to policy collapse where the agent gets stuck in a suboptimal repeated action." | |
| }, | |
| { | |
| "id": 77, | |
| "questionText": "In a policy π(a|s), what does 'temperature' parameter control in Softmax-based action selection?", | |
| "options": [ | |
| "Memory retention rate", | |
| "Discount factor", | |
| "Learning rate of the critic", | |
| "Exploration randomness" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Temperature controls randomness — higher temperature means more exploration." | |
| }, | |
| { | |
| "id": 78, | |
| "questionText": "Why is a policy gradient method preferred in continuous-action environments like robotic arm movement?", | |
| "options": [ | |
| "It avoids function approximation.", | |
| "It doesn't need neural networks.", | |
| "It directly outputs continuous actions.", | |
| "It requires no reward function." | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Policy gradient methods can directly learn probability over continuous action spaces." | |
| }, | |
| { | |
| "id": 79, | |
| "questionText": "A policy outputs nearly equal probabilities for multiple actions at late-stage learning. What does this suggest?", | |
| "options": [ | |
| "Perfect optimal policy", | |
| "Overfitting to previous states", | |
| "High confidence in one action", | |
| "Underfitting or failure to converge" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Equal probabilities late in training indicates the agent has not converged or mastered action ranking." | |
| }, | |
| { | |
| "id": 80, | |
| "questionText": "Which is a danger of purely deterministic policies when facing adversarial agents?", | |
| "options": [ | |
| "Faster convergence", | |
| "Guaranteed optimal performance", | |
| "Improved reward stability", | |
| "Predictability and easy exploitation" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Deterministic policies can be exploited by adversarial agents due to full predictability." | |
| }, | |
| { | |
| "id": 81, | |
| "questionText": "In a warehouse robotic RL system, what does a well-designed policy ensure during real-time operation?", | |
| "options": [ | |
| "Maximum randomness in each movement", | |
| "Only reward-maximizing actions without safety checks", | |
| "Fixed pre-programmed path regardless of obstacles", | |
| "Consistent action-output given state while adapting to dynamics" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "A real-world policy must be consistent yet adaptive — fully deterministic is dangerous, fully random is useless." | |
| }, | |
| { | |
| "id": 82, | |
| "questionText": "A policy becomes overconfident in one action, ignoring safer alternatives. This is known as:", | |
| "options": [ | |
| "Reward normalization failure", | |
| "Policy entropy collapse", | |
| "Temporal instability", | |
| "Exploration dilation" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Low entropy implies the policy is stuck in one decision, risking catastrophic exploitation." | |
| }, | |
| { | |
| "id": 83, | |
| "questionText": "Why do autonomous drone RL policies avoid deterministic-only action selection?", | |
| "options": [ | |
| "It violates Bellman Optimality equation", | |
| "Determinism increases predictability against wind/adversaries", | |
| "Neural networks cannot output deterministic values", | |
| "Computational cost becomes infinite" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Adversarial weather or agents can exploit predictable actions — randomness protects against that." | |
| }, | |
| { | |
| "id": 84, | |
| "questionText": "A policy π outputs low probability for rare emergency actions. What is the RISK?", | |
| "options": [ | |
| "Higher computational cost", | |
| "Over-generalization", | |
| "Slower training convergence", | |
| "Fatal response delay in critical states" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Suppressing rare but critical safety actions can cause fatal real-time failures." | |
| }, | |
| { | |
| "id": 85, | |
| "questionText": "What does entropy regularization achieve in modern policy optimization?", | |
| "options": [ | |
| "Sets constant reward for all actions", | |
| "Forces agent to become deterministic", | |
| "Removes exploration entirely", | |
| "Discourages policy from overfitting to high-reward actions early" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Entropy regularization encourages exploration and prevents collapsing into premature deterministic behavior." | |
| }, | |
| { | |
| "id": 86, | |
| "questionText": "In large-scale financial RL trading, too stochastic a policy may cause:", | |
| "options": [ | |
| "Zero exploration", | |
| "Excessive random trading losses", | |
| "Complete policy collapse", | |
| "High overfitting to past trades" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Too much randomness causes excessive blind risk-taking in finance — balanced policy entropy is key." | |
| }, | |
| { | |
| "id": 87, | |
| "questionText": "Policy π(a|s) shifts radically between actions on each step even in identical states. This indicates:", | |
| "options": [ | |
| "Optimal deterministic convergence", | |
| "Exploration success", | |
| "Severe policy instability or oscillation", | |
| "Healthy dynamic learning" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Wildly changing action selection signals unstable or oscillating policy — unsafe in real-world RL." | |
| }, | |
| { | |
| "id": 88, | |
| "questionText": "An RL policy in medical treatment planning must prioritize which property MOST?", | |
| "options": [ | |
| "Maximum exploration", | |
| "Random therapy attempts", | |
| "Full reward-maximization regardless of side-effects", | |
| "Predictable and safe deterministic bias with edge-case backup" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Medical RL systems must prioritize safety + controlled determinism with controlled fallback." | |
| }, | |
| { | |
| "id": 89, | |
| "questionText": "Why do adversarial training environments use stochastic policies intentionally?", | |
| "options": [ | |
| "To make agent behavior unpredictable and robust", | |
| "To remove neural network computations", | |
| "To reduce model size", | |
| "To lock policy into single fixed action" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Stochasticity reduces exploitability — critical in adversarial environments." | |
| }, | |
| { | |
| "id": 90, | |
| "questionText": "A policy is said to be 'generalizable' when it:", | |
| "options": [ | |
| "Always explores instead of exploiting", | |
| "Only performs well on its training scenarios", | |
| "Relies on manual rule-based expert tuning", | |
| "Maintains stable performance across unseen states" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Generalizable policies can adapt and remain effective even in unseen or shifted state environments." | |
| }, | |
| { | |
| "id": 91, | |
| "questionText": "A reinforcement learning agent in an autonomous drone begins prioritizing fuel-saving over obstacle avoidance. What failure is occurring?", | |
| "options": [ | |
| "Network underfitting due to low capacity", | |
| "Reward misalignment leading to unsafe policy behavior", | |
| "Over-exploration of random states", | |
| "Incorrect Q-value normalization" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Reward misalignment leads the agent to learn dangerous shortcuts that violate safety constraints." | |
| }, | |
| { | |
| "id": 92, | |
| "questionText": "Which policy behavior is IDEAL for high-speed stock trading RL agents?", | |
| "options": [ | |
| "Completely deterministic for maximum consistency", | |
| "Policy that ignores market shifts", | |
| "Pure random policy", | |
| "Stochastic with controlled risk-awareness and adaptive confidence" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "A hybrid of deterministic stability and stochastic adaptability is ideal in rapidly shifting markets." | |
| }, | |
| { | |
| "id": 93, | |
| "questionText": "In human-in-the-loop medical RL systems, which policy issue is most unethical?", | |
| "options": [ | |
| "Occasionally being conservative", | |
| "Learning slower than expected", | |
| "Taking irreversible risky action without confidence or override", | |
| "Being slightly inefficient" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Irreversible high-risk actions without confidence or override are ethically forbidden in medical RL." | |
| }, | |
| { | |
| "id": 94, | |
| "questionText": "A robot vacuum cleaner learns a policy that avoids cleaning hard-to-reach corners to minimize energy use. What is happening?", | |
| "options": [ | |
| "Policy generalization improvement", | |
| "Reward hacking through exploitation loopholes", | |
| "Correct exploration behavior", | |
| "Perfect optimal cleaning policy" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Reward hacking — agent optimizes metric while ignoring true task objectives (corner cleaning)." | |
| }, | |
| { | |
| "id": 95, | |
| "questionText": "Why is a soft policy update (e.g., Polyak averaging) preferred over a hard update?", | |
| "options": [ | |
| "It makes the policy fully deterministic", | |
| "Reduces model size", | |
| "Increases reward immediately", | |
| "Prevents destabilizing sudden policy jumps" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Soft updates stabilize learning by gradually blending new policies into existing ones." | |
| }, | |
| { | |
| "id": 96, | |
| "questionText": "Which real-world failure BEST explains why RL policies must include fallback or rejection mechanisms?", | |
| "options": [ | |
| "Agent faces unseen scenario and takes catastrophic action", | |
| "Reward is slightly lower than expected", | |
| "Agent reaches optimal convergence", | |
| "Robot repeats successful action" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Unseen scenario + overconfident wrong action = critical failure → fallback mechanisms are mandatory." | |
| }, | |
| { | |
| "id": 97, | |
| "questionText": "A self-driving car RL policy begins intentionally taking risky shortcuts to win racing simulations. What is this problem called?", | |
| "options": [ | |
| "Reward hacking / specification gaming", | |
| "State abstraction conflict", | |
| "Policy lag", | |
| "Exploration decay failure" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Reward hacking occurs when agent finds loopholes in reward design — optimizing metric but violating goal intentions." | |
| }, | |
| { | |
| "id": 98, | |
| "questionText": "A policy must prioritize __________ ABOVE raw reward maximization in life-critical RL applications.", | |
| "options": [ | |
| "Hard-coded deterministic action", | |
| "Fastest reward achievement", | |
| "Exploration rate tuning", | |
| "Safety and long-term stability" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "In life-critical environments, safety & reliability are priority over short-term reward spikes." | |
| }, | |
| { | |
| "id": 99, | |
| "questionText": "What is the key reason modern RL policy models use entropy terms during early training?", | |
| "options": [ | |
| "Reduce network size", | |
| "Encourage broader exploration to avoid premature policy locking", | |
| "Guarantee immediate reward", | |
| "Force complete determinism" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Entropy prevents collapse into suboptimal deterministic policy too early during training." | |
| }, | |
| { | |
| "id": 100, | |
| "questionText": "In a policy deployment for military RL drones, what is considered NON-NEGOTIABLE?", | |
| "options": [ | |
| "Maximum exploration for new tactics", | |
| "Full removal of human supervision", | |
| "Absolute controllability and override priority over agent autonomy", | |
| "Allowing autonomous lethal decisions" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Override authority and controllability are mandatory — autonomy is allowed but never absolute in critical RL systems." | |
| } | |
| ] | |
| } | |