Spaces:

deedrop1140
/

MachineLearningAlgorithms

Running

File size: 45,032 Bytes

0d00d62

{
  "title": "Reinforcement Learning Policy Mastery: 100 MCQs",
  "description": "A comprehensive set of 100 multiple-choice questions covering RL Policies, including actions, policy types (deterministic, stochastic, greedy, ε-greedy), optimization, exploration vs. exploitation, and real-world scenarios.",
  "questions": [
    {
      "id": 1,
      "questionText": "In reinforcement learning, what is an action?",
      "options": [
        "A fixed rule of the environment",
        "A numerical reward signal",
        "A choice made by the agent to interact with the environment",
        "Information received from the environment"
      ],
      "correctAnswerIndex": 2,
      "explanation": "An action is the decision taken by the agent to affect the environment."
    },
    {
      "id": 2,
      "questionText": "What is a policy in reinforcement learning?",
      "options": [
        "A mapping from states to actions",
        "A sequence of states",
        "The transition probability",
        "The total reward received"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A policy is a strategy that decides which action to take in each state."
    },
    {
      "id": 3,
      "questionText": "A policy is denoted mathematically as:",
      "options": [
        "γ",
        "R(s)",
        "π(a|s)",
        "P(s’|s,a)"
      ],
      "correctAnswerIndex": 2,
      "explanation": "π(a|s) defines the probability of taking action a when in state s."
    },
    {
      "id": 4,
      "questionText": "A deterministic policy always:",
      "options": [
        "Selects the same action for a given state",
        "Depends only on reward",
        "Selects random actions",
        "Changes over time"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A deterministic policy selects the same action every time for a specific state."
    },
    {
      "id": 5,
      "questionText": "A stochastic policy:",
      "options": [
        "Does not involve randomness",
        "Ignores the current state",
        "Gives a probability distribution over actions",
        "Always selects the same action"
      ],
      "correctAnswerIndex": 2,
      "explanation": "A stochastic policy gives probabilities for each possible action."
    },
    {
      "id": 6,
      "questionText": "Which of the following represents an action set?",
      "options": [
        "History of transitions",
        "All possible states in the environment",
        "All rewards received",
        "All possible decisions the agent can make at a state"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The action set is the list of all actions the agent can choose."
    },
    {
      "id": 7,
      "questionText": "A policy helps the agent to:",
      "options": [
        "Modify the environment",
        "Decide which action to take",
        "Terminate learning immediately",
        "Change the reward function"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A policy directly guides the agent’s decisions at each state."
    },
    {
      "id": 8,
      "questionText": "An optimal policy aims to:",
      "options": [
        "Only get immediate rewards",
        "Avoid terminal states",
        "Maximize long-term cumulative reward",
        "Minimize actions taken"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Optimal policies aim to maximize cumulative return, not just immediate rewards."
    },
    {
      "id": 9,
      "questionText": "Which symbol is commonly used for policy?",
      "options": [
        "λ (lambda)",
        "δ (delta)",
        "π (pi)",
        "μ (mu)"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Policy is conventionally represented as π in reinforcement learning literature."
    },
    {
      "id": 10,
      "questionText": "In reinforcement learning, actions are chosen based on:",
      "options": [
        "Fixed environment rules",
        "The current policy",
        "Random guessing",
        "The reward only"
      ],
      "correctAnswerIndex": 1,
      "explanation": "The agent follows its policy to choose an action in each state."
    },
    {
      "id": 11,
      "questionText": "A policy π(a|s) defines:",
      "options": [
        "The reward received after an action",
        "The probability of taking action a in state s",
        "The discount factor",
        "The next state transition"
      ],
      "correctAnswerIndex": 1,
      "explanation": "π(a|s) gives the probability that the agent selects action a when it is in state s."
    },
    {
      "id": 12,
      "questionText": "Which type of policy is commonly used during exploration?",
      "options": [
        "Deterministic policy",
        "Static policy",
        "Stochastic policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Stochastic policies allow randomness and help the agent to explore better."
    },
    {
      "id": 13,
      "questionText": "What does a greedy policy do?",
      "options": [
        "Always selects the action with the highest estimated value",
        "Selects random actions",
        "Avoids high rewards",
        "Maximizes long-term regret"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A greedy policy always selects the action currently believed to give the highest reward."
    },
    {
      "id": 14,
      "questionText": "The ε-greedy policy:",
      "options": [
        "Always exploits the best action",
        "Balances exploration and exploitation",
        "Never selects random actions",
        "Explores only"
      ],
      "correctAnswerIndex": 1,
      "explanation": "ε-greedy policy chooses the best action most of the time, but sometimes explores randomly."
    },
    {
      "id": 15,
      "questionText": "A policy directly controls:",
      "options": [
        "How rewards are given",
        "The discount factor",
        "Which action the agent selects",
        "The transition probability"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The policy is responsible for selecting which action to take in each state."
    },
    {
      "id": 16,
      "questionText": "Which of the following is true for a deterministic policy?",
      "options": [
        "It randomly selects an action",
        "It maps each state to exactly one action",
        "It gives probabilities for actions",
        "It has no control over actions"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Deterministic policy chooses exactly one action for a given state."
    },
    {
      "id": 17,
      "questionText": "In reinforcement learning, the agent follows a policy to:",
      "options": [
        "Generate rewards",
        "Stop learning",
        "Define terminal states",
        "Decide actions"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The policy guides the agent in selecting actions during interaction."
    },
    {
      "id": 18,
      "questionText": "A policy that selects actions purely based on current reward and ignores future rewards is:",
      "options": [
        "Myopic",
        "Long-term",
        "Optimal",
        "Model-based"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A myopic policy only focuses on immediate rewards and not the future."
    },
    {
      "id": 19,
      "questionText": "Which of the following best describes the role of a policy?",
      "options": [
        "It stores previous rewards",
        "It defines the agent’s behavior strategy",
        "It calculates final reward only",
        "It controls environment dynamics"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A policy is the strategy that the agent uses to decide what action to take."
    },
    {
      "id": 20,
      "questionText": "What is the goal of policy optimization?",
      "options": [
        "To eliminate all randomness",
        "To improve the policy for higher long-term rewards",
        "To reduce the number of states",
        "To modify the reward function"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Policy optimization aims to improve decision-making for maximum cumulative reward."
    },
    {
      "id": 21,
      "questionText": "Which type of policy is preferred during exploitation?",
      "options": [
        "Exploratory",
        "Myopic",
        "Fully random",
        "Deterministic"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Exploitation prefers deterministic behavior for consistent maximum return."
    },
    {
      "id": 22,
      "questionText": "What does ε represent in ε-greedy policy?",
      "options": [
        "Learning rate",
        "Probability of exploration",
        "State transition cost",
        "Discount factor"
      ],
      "correctAnswerIndex": 1,
      "explanation": "ε controls how often the agent explores randomly instead of exploiting."
    },
    {
      "id": 23,
      "questionText": "A policy that changes and improves over time is called:",
      "options": [
        "Fixed policy",
        "Adaptive policy",
        "Terminal policy",
        "Static policy"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Adaptive policies evolve with learning and improve performance over time."
    },
    {
      "id": 24,
      "questionText": "Which algorithm often improves policy iteratively?",
      "options": [
        "Clustering",
        "Bubble sort",
        "Policy gradient",
        "Binary search"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Policy gradient is a reinforcement learning method specifically designed to optimize policies."
    },
    {
      "id": 25,
      "questionText": "The policy essentially forms the agent’s:",
      "options": [
        "Memory buffer",
        "Reward system",
        "Behavior strategy",
        "Environment model"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The policy is the behavior strategy used to choose actions."
    },
    {
      "id": 26,
      "questionText": "Which policy guarantees the same output for each particular input state?",
      "options": [
        "Random policy",
        "Stochastic policy",
        "Deterministic policy",
        "Exploration-only policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Deterministic policies always map one state to one fixed action."
    },
    {
      "id": 27,
      "questionText": "What is the output of a policy?",
      "options": [
        "A future state",
        "A value estimate",
        "A reward signal",
        "An action selection"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The policy outputs an action based on the current state."
    },
    {
      "id": 28,
      "questionText": "Which of the following is true?",
      "options": [
        "Discount factor selects actions",
        "Environment follows the agent’s rules",
        "Policy decides the action; environment gives the outcome",
        "Policy gives reward directly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Policy → action. Environment → next state and reward."
    },
    {
      "id": 29,
      "questionText": "Which kind of policy is more flexible and good for exploration?",
      "options": [
        "Static policy",
        "Deterministic policy",
        "History-free policy",
        "Stochastic policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Stochastic policies are better for discovering new strategies via randomness."
    },
    {
      "id": 30,
      "questionText": "A policy that does not change over time is called:",
      "options": [
        "Optimal policy",
        "Static policy",
        "Gradient policy",
        "Adaptive policy"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A static policy remains fixed throughout the learning process."
    },
    {
      "id": 31,
      "questionText": "Which policy type is best suited for environments with high uncertainty or partially observable states?",
      "options": [
        "Stochastic policy",
        "Deterministic policy",
        "Static policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Stochastic policies allow exploration and handle uncertainty better by assigning probabilities to actions."
    },
    {
      "id": 32,
      "questionText": "In reinforcement learning, what does the policy π directly depend on?",
      "options": [
        "Past rewards only",
        "Current state",
        "Network latency",
        "Future rewards only"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Policy π(a|s) maps the current state s to an action or probability distribution over actions."
    },
    {
      "id": 33,
      "questionText": "A greedy policy always:",
      "options": [
        "Avoids previously selected actions",
        "Explores equally among all actions",
        "Chooses the action with highest estimated value",
        "Chooses randomly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Greedy policy exploits knowledge and always selects the best-known action."
    },
    {
      "id": 34,
      "questionText": "What happens if a policy focuses only on short-term rewards?",
      "options": [
        "It becomes myopic and may miss long-term gains",
        "It guarantees optimal performance",
        "It never learns from mistakes",
        "It requires no environment feedback"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A short-sighted policy (myopic) becomes greedy and may miss high future rewards."
    },
    {
      "id": 35,
      "questionText": "Which policy ensures a fixed mapping from state to action but does NOT involve randomness?",
      "options": [
        "Stochastic policy",
        "Deterministic policy",
        "Exploratory policy",
        "Random policy"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Deterministic policy gives the same action every time for the same state."
    },
    {
      "id": 36,
      "questionText": "An ε-greedy policy will choose a random action with probability:",
      "options": [
        "1 − ε",
        "ε",
        "0",
        "Always"
      ],
      "correctAnswerIndex": 1,
      "explanation": "ε controls exploration; with probability ε, a random action is selected."
    },
    {
      "id": 37,
      "questionText": "What is the purpose of policy improvement?",
      "options": [
        "To reduce the number of actions",
        "To make the policy produce better long-term rewards",
        "To disable exploration",
        "To eliminate randomness"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Policy improvement updates the agent’s strategy toward higher future returns."
    },
    {
      "id": 38,
      "questionText": "Which type of policy selects an action with a probability proportional to its value?",
      "options": [
        "Static policy",
        "Deterministic policy",
        "Greedy policy",
        "Softmax policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Softmax policies assign action probabilities based on exponentiated value estimates."
    },
    {
      "id": 39,
      "questionText": "In reinforcement learning, a policy defines:",
      "options": [
        "The environment transitions",
        "The discount factor",
        "The reward assignment logic",
        "The rule for choosing actions"
      ],
      "correctAnswerIndex": 3,
      "explanation": "A policy completely governs how an agent selects actions in each state."
    },
    {
      "id": 40,
      "questionText": "If a policy never explores new actions, it may:",
      "options": [
        "Increase randomness over time",
        "Get stuck in a suboptimal behavior",
        "Automatically find the best policy",
        "Reset the environment state"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Without exploration, the agent may get stuck in local optimum solutions."
    },
    {
      "id": 41,
      "questionText": "Which policy approach balances both exploitation and exploration?",
      "options": [
        "Greedy policy",
        "Deterministic policy",
        "Static policy",
        "ε-greedy policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "ε-greedy selects the best-known action most of the time while still exploring occasionally."
    },
    {
      "id": 42,
      "questionText": "Which policy is purely exploitation-based?",
      "options": [
        "ε-greedy policy",
        "Softmax policy",
        "Stochastic policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Greedy policy always selects the current best action and does not explore."
    },
    {
      "id": 43,
      "questionText": "A policy that does not change over time is called:",
      "options": [
        "Adaptive policy",
        "Exploratory policy",
        "Parameterized policy",
        "Fixed or static policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Static policies remain unchanged during training."
    },
    {
      "id": 44,
      "questionText": "Which type of policy is optimized directly in Policy Gradient methods?",
      "options": [
        "Static policy",
        "Stochastic policy",
        "Rule-based policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Policy gradient algorithms operate directly on stochastic parameterized policies."
    },
    {
      "id": 45,
      "questionText": "What is the core objective of optimal policy?",
      "options": [
        "To reduce number of actions",
        "To minimize time spent in each state",
        "To maximize cumulative long-term reward",
        "To avoid terminal states"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The goal of optimal policy is to maximize expected discounted return over time."
    },
    {
      "id": 46,
      "questionText": "Which policy type outputs a probability distribution over actions?",
      "options": [
        "Static policy",
        "Deterministic policy",
        "Stochastic policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Stochastic policies output action probabilities rather than a fixed action."
    },
    {
      "id": 47,
      "questionText": "Which situation requires stochastic policies over deterministic ones?",
      "options": [
        "When multiple actions have equal rewards",
        "Reward is guaranteed every step",
        "No randomness is allowed",
        "Agent already knows full environment"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Stochastic policies prevent bias when actions are equally good."
    },
    {
      "id": 48,
      "questionText": "Which policy ensures full exploration but no exploitation?",
      "options": [
        "Deterministic policy",
        "Greedy policy",
        "Softmax policy",
        "Random policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Random policy entirely explores without considering reward values."
    },
    {
      "id": 49,
      "questionText": "The policy improvement step checks:",
      "options": [
        "Only immediate rewards",
        "If taking a better action increases long-term reward",
        "If randomness is eliminated",
        "The size of the replay buffer"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Policy improvement ensures updated actions lead to better expected return."
    },
    {
      "id": 50,
      "questionText": "What happens if ε in ε-greedy policy is too high?",
      "options": [
        "All rewards become zero",
        "Policy becomes deterministic",
        "Agent explores too much and learns slowly",
        "Agent gets stuck in exploitation"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Too high ε causes too much random exploration and slower learning."
    },
    {
      "id": 51,
      "questionText": "What is the main disadvantage of a purely greedy policy?",
      "options": [
        "It explores too much",
        "It may get stuck in a local optimum",
        "It ignores rewards completely",
        "It requires deep models only"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Greedy policy exploits only current best action and may miss higher long-term rewards."
    },
    {
      "id": 52,
      "questionText": "Which policy is essential for exploration-exploitation balance in RL?",
      "options": [
        "ε-greedy policy",
        "Greedy policy",
        "Random policy",
        "Static policy"
      ],
      "correctAnswerIndex": 0,
      "explanation": "ε-greedy smartly balances exploitation with occasional exploration."
    },
    {
      "id": 53,
      "questionText": "Why are stochastic policies preferred over deterministic ones in partially observable environments?",
      "options": [
        "They prevent overfitting to rewards.",
        "They improve exploration and robustness to uncertainty.",
        "They guarantee immediate reward.",
        "They reduce computation time."
      ],
      "correctAnswerIndex": 1,
      "explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings."
    },
    {
      "id": 54,
      "questionText": "Policy π(a|s) in RL typically represents:",
      "options": [
        "Transition from state s to s'",
        "Value of the state",
        "Sum of discounted rewards",
        "Probability of selecting action a at state s"
      ],
      "correctAnswerIndex": 3,
      "explanation": "π(a|s) expresses how likely the agent is to take action a in state s."
    },
    {
      "id": 55,
      "questionText": "What happens if ε is set to 0 in ε-greedy policy?",
      "options": [
        "It stops learning",
        "It becomes fully random",
        "The policy becomes fully greedy",
        "It becomes softmax-based"
      ],
      "correctAnswerIndex": 2,
      "explanation": "ε = 0 means no exploration — pure greedy exploitation."
    },
    {
      "id": 56,
      "questionText": "Softmax policy chooses actions based on:",
      "options": [
        "Normalized exponentiated action values",
        "Fixed priority order",
        "Pure randomness",
        "Greedy selection only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Softmax uses exponentiated Q-values and applies probability distribution."
    },
    {
      "id": 57,
      "questionText": "A well-optimized policy should be:",
      "options": [
        "Stable and yield maximum cumulative reward",
        "Constantly changing even after convergence",
        "Ignoring future rewards",
        "Completely random"
      ],
      "correctAnswerIndex": 0,
      "explanation": "An ideal policy should maximize cumulative return with stable decision-making."
    },
    {
      "id": 58,
      "questionText": "Which policy is more suitable for early training?",
      "options": [
        "Deterministic policy",
        "Greedy policy",
        "Static policy",
        "Stochastic or ε-greedy policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Stochastic or ε-greedy policies help explore environment during learning phase."
    },
    {
      "id": 59,
      "questionText": "A deterministic policy π(s) returns:",
      "options": [
        "A random action every time",
        "A probability distribution",
        "A single fixed action for state s",
        "The immediate reward"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Deterministic policy gives one specific action for each state."
    },
    {
      "id": 60,
      "questionText": "What problem does the ε-greedy policy try to solve?",
      "options": [
        "Reward scaling issue",
        "Learning rate adjustment",
        "Exploration vs exploitation dilemma",
        "Model overfitting"
      ],
      "correctAnswerIndex": 2,
      "explanation": "ε-greedy gives balance between trying new actions and exploiting known good ones."
    },
    {
      "id": 61,
      "questionText": "A policy that adapts and improves as learning progresses is called:",
      "options": [
        "Random policy",
        "Static policy",
        "Adaptive policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Adaptive policy updates itself over time for better performance."
    },
    {
      "id": 62,
      "questionText": "If ε is too small in ε-greedy policy:",
      "options": [
        "It stops acting on states",
        "Policy becomes unstable",
        "Agent keeps taking random actions",
        "The agent may not explore enough"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Too small ε leads to very less exploration — causing local optimum trap."
    },
    {
      "id": 63,
      "questionText": "Which policy guarantees the best long-term reward theoretically?",
      "options": [
        "Static policy",
        "Random policy",
        "Greedy policy",
        "Optimal policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The optimal policy is the mathematically best-performing policy."
    },
    {
      "id": 64,
      "questionText": "Policy π is said to converge when:",
      "options": [
        "Environment resets",
        "Rewards become zero",
        "Further updates do not improve its long-term return",
        "It stops selecting actions"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Convergence means policy has reached optimality and no longer changes significantly."
    },
    {
      "id": 65,
      "questionText": "Which policy is best when actions are continuous, like turning a steering wheel?",
      "options": [
        "Static policy",
        "Greedy-only policy",
        "Stochastic policy",
        "Greedy policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Stochastic policies can represent probability distributions over continuous action values."
    },
    {
      "id": 66,
      "questionText": "Which policy type is commonly used in deep policy gradient methods?",
      "options": [
        "Parameterized stochastic policy",
        "Static deterministic policy",
        "Greedy-only policy",
        "Pure random policy"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Policy gradient RL learns continuous parameterized stochastic policies."
    },
    {
      "id": 67,
      "questionText": "What is the goal of an optimal policy π*?",
      "options": [
        "To maximize the expected cumulative future reward",
        "To ensure equal action probability",
        "To ignore future outcomes",
        "To minimize state visitation"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Optimal policy focuses only on maximizing long-term discounted returns."
    },
    {
      "id": 68,
      "questionText": "Which policy ensures highest theoretical performance but may be hard to compute?",
      "options": [
        "Optimal policy π*",
        "Static policy",
        "Fixed greedy policy",
        "Random policy"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Optimal policy π* is mathematically best but hard to derive in complex environments."
    },
    {
      "id": 69,
      "questionText": "When does ε decay in ε-greedy policy?",
      "options": [
        "At the start of training only",
        "As training progresses to reduce randomness",
        "Only when reward is negative",
        "When the environment resets"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Exploration reduces over time by decaying ε — leading to more exploitation later."
    },
    {
      "id": 70,
      "questionText": "Which policy is most suitable for a fully trained agent ready for deployment?",
      "options": [
        "Random policy",
        "High ε policy",
        "Deterministic or greedy policy",
        "Static equal probability policy"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Once fully trained, deterministic greedy policies are ideal for stable deployment."
    },
    {
      "id": 71,
      "questionText": "In a self-driving car RL system, a 'policy' is best described as:",
      "options": [
        "A learned probability distribution over actions given states.",
        "A list of manually coded safety instructions.",
        "A fixed rule for mapping visual input to steering angle.",
        "A memory of all past successful trajectories."
      ],
      "correctAnswerIndex": 0,
      "explanation": "A policy defines the agent’s behavior as a probability distribution of choosing actions conditioned on the current state."
    },
    {
      "id": 72,
      "questionText": "Why are stochastic policies often preferred over deterministic ones in partially observable environments?",
      "options": [
        "They guarantee immediate reward.",
        "They prevent overfitting to rewards.",
        "They reduce computation time.",
        "They improve exploration and robustness to uncertainty."
      ],
      "correctAnswerIndex": 3,
      "explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings."
    },
    {
      "id": 73,
      "questionText": "A policy π(a|s) outputs 0.9 for action A and 0.1 for action B. What does this imply?",
      "options": [
        "Action A is always selected.",
        "Both actions have equal probability.",
        "Action B is always selected.",
        "Action A has higher likelihood of being chosen."
      ],
      "correctAnswerIndex": 3,
      "explanation": "The policy indicates the relative preference for taking actions based on their probabilities."
    },
    {
      "id": 74,
      "questionText": "In policy-based RL algorithms, what is optimized directly?",
      "options": [
        "The policy parameters.",
        "The value function.",
        "The reward function.",
        "The transition probability of environment states."
      ],
      "correctAnswerIndex": 0,
      "explanation": "Policy-based methods optimize the policy parameters directly to maximize expected reward."
    },
    {
      "id": 75,
      "questionText": "Which statement is TRUE about greedy policies?",
      "options": [
        "They always pick the action with highest estimated value.",
        "They ignore Q-values completely.",
        "They randomly choose between top two actions.",
        "They always balance exploration and exploitation."
      ],
      "correctAnswerIndex": 0,
      "explanation": "A greedy policy always chooses the action with maximum current estimated value."
    },
    {
      "id": 76,
      "questionText": "A trading bot based on RL keeps choosing the same profitable action and misses future better opportunities. Which failure is happening?",
      "options": [
        "Exploration overflow",
        "Reward hacking",
        "State corruption",
        "Policy collapse due to over-exploitation"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Over-exploitation leads to policy collapse where the agent gets stuck in a suboptimal repeated action."
    },
    {
      "id": 77,
      "questionText": "In a policy π(a|s), what does 'temperature' parameter control in Softmax-based action selection?",
      "options": [
        "Memory retention rate",
        "Discount factor",
        "Learning rate of the critic",
        "Exploration randomness"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Temperature controls randomness — higher temperature means more exploration."
    },
    {
      "id": 78,
      "questionText": "Why is a policy gradient method preferred in continuous-action environments like robotic arm movement?",
      "options": [
        "It avoids function approximation.",
        "It doesn't need neural networks.",
        "It directly outputs continuous actions.",
        "It requires no reward function."
      ],
      "correctAnswerIndex": 2,
      "explanation": "Policy gradient methods can directly learn a probability distribution over continuous action spaces, so the policy outputs continuous actions without discretization."
    },
    {
      "id": 79,
      "questionText": "A policy outputs nearly equal probabilities for multiple actions at late-stage learning. What does this suggest?",
      "options": [
        "Perfect optimal policy",
        "Overfitting to previous states",
        "High confidence in one action",
        "Underfitting or failure to converge"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Equal probabilities late in training indicate the agent has not converged or mastered action ranking."
    },
    {
      "id": 80,
      "questionText": "Which is a danger of purely deterministic policies when facing adversarial agents?",
      "options": [
        "Faster convergence",
        "Guaranteed optimal performance",
        "Improved reward stability",
        "Predictability and easy exploitation"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Deterministic policies can be exploited by adversarial agents due to full predictability."
    },
    {
      "id": 81,
      "questionText": "In a warehouse robotic RL system, what does a well-designed policy ensure during real-time operation?",
      "options": [
        "Maximum randomness in each movement",
        "Only reward-maximizing actions without safety checks",
        "Fixed pre-programmed path regardless of obstacles",
        "Consistent action-output given state while adapting to dynamics"
      ],
      "correctAnswerIndex": 3,
      "explanation": "A real-world policy must be consistent yet adaptive — a rigid pre-programmed path is dangerous, and fully random behavior is useless."
    },
    {
      "id": 82,
      "questionText": "A policy becomes overconfident in one action, ignoring safer alternatives. This is known as:",
      "options": [
        "Reward normalization failure",
        "Policy entropy collapse",
        "Temporal instability",
        "Exploration dilation"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Low entropy implies the policy is stuck in one decision, risking catastrophic exploitation."
    },
    {
      "id": 83,
      "questionText": "Why do autonomous drone RL policies avoid deterministic-only action selection?",
      "options": [
        "It violates Bellman Optimality equation",
        "Determinism increases predictability against wind/adversaries",
        "Neural networks cannot output deterministic values",
        "Computational cost becomes infinite"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Adversarial weather or agents can exploit predictable actions — randomness protects against that."
    },
    {
      "id": 84,
      "questionText": "A policy π outputs low probability for rare emergency actions. What is the RISK?",
      "options": [
        "Higher computational cost",
        "Over-generalization",
        "Slower training convergence",
        "Fatal response delay in critical states"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Suppressing rare but critical safety actions can cause fatal real-time failures."
    },
    {
      "id": 85,
      "questionText": "What does entropy regularization achieve in modern policy optimization?",
      "options": [
        "Sets constant reward for all actions",
        "Forces agent to become deterministic",
        "Removes exploration entirely",
        "Discourages policy from overfitting to high-reward actions early"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Entropy regularization encourages exploration and prevents collapsing into premature deterministic behavior."
    },
    {
      "id": 86,
      "questionText": "In large-scale financial RL trading, too stochastic a policy may cause:",
      "options": [
        "Zero exploration",
        "Excessive random trading losses",
        "Complete policy collapse",
        "High overfitting to past trades"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Too much randomness causes excessive blind risk-taking in finance — balanced policy entropy is key."
    },
    {
      "id": 87,
      "questionText": "Policy π(a|s) shifts radically between actions on each step even in identical states. This indicates:",
      "options": [
        "Optimal deterministic convergence",
        "Exploration success",
        "Severe policy instability or oscillation",
        "Healthy dynamic learning"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Wildly changing action selection signals unstable or oscillating policy — unsafe in real-world RL."
    },
    {
      "id": 88,
      "questionText": "An RL policy in medical treatment planning must prioritize which property MOST?",
      "options": [
        "Maximum exploration",
        "Random therapy attempts",
        "Full reward-maximization regardless of side-effects",
        "Predictable and safe deterministic bias with edge-case backup"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Medical RL systems must prioritize safety: predictable, largely deterministic behavior with a controlled fallback for edge cases."
    },
    {
      "id": 89,
      "questionText": "Why do adversarial training environments use stochastic policies intentionally?",
      "options": [
        "To make agent behavior unpredictable and robust",
        "To remove neural network computations",
        "To reduce model size",
        "To lock policy into single fixed action"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Stochasticity reduces exploitability — critical in adversarial environments."
    },
    {
      "id": 90,
      "questionText": "A policy is said to be 'generalizable' when it:",
      "options": [
        "Always explores instead of exploiting",
        "Only performs well on its training scenarios",
        "Relies on manual rule-based expert tuning",
        "Maintains stable performance across unseen states"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Generalizable policies can adapt and remain effective even in unseen or shifted state environments."
    },
    {
      "id": 91,
      "questionText": "A reinforcement learning agent in an autonomous drone begins prioritizing fuel-saving over obstacle avoidance. What failure is occurring?",
      "options": [
        "Network underfitting due to low capacity",
        "Reward misalignment leading to unsafe policy behavior",
        "Over-exploration of random states",
        "Incorrect Q-value normalization"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Reward misalignment leads the agent to learn dangerous shortcuts that violate safety constraints."
    },
    {
      "id": 92,
      "questionText": "Which policy behavior is IDEAL for high-speed stock trading RL agents?",
      "options": [
        "Completely deterministic for maximum consistency",
        "Policy that ignores market shifts",
        "Pure random policy",
        "Stochastic with controlled risk-awareness and adaptive confidence"
      ],
      "correctAnswerIndex": 3,
      "explanation": "A hybrid of deterministic stability and stochastic adaptability is ideal in rapidly shifting markets."
    },
    {
      "id": 93,
      "questionText": "In human-in-the-loop medical RL systems, which policy issue is most unethical?",
      "options": [
        "Occasionally being conservative",
        "Learning slower than expected",
        "Taking irreversible risky action without confidence or override",
        "Being slightly inefficient"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Irreversible high-risk actions without confidence or override are ethically forbidden in medical RL."
    },
    {
      "id": 94,
      "questionText": "A robot vacuum cleaner learns a policy that avoids cleaning hard-to-reach corners to minimize energy use. What is happening?",
      "options": [
        "Policy generalization improvement",
        "Reward hacking through exploitation loopholes",
        "Correct exploration behavior",
        "Perfect optimal cleaning policy"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Reward hacking — the agent optimizes the metric (energy use) while ignoring the true task objective (cleaning corners)."
    },
    {
      "id": 95,
      "questionText": "Why is a soft policy update (e.g., Polyak averaging) preferred over a hard update?",
      "options": [
        "It makes the policy fully deterministic",
        "Reduces model size",
        "Increases reward immediately",
        "Prevents destabilizing sudden policy jumps"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Soft updates stabilize learning by gradually blending new policies into existing ones."
    },
    {
      "id": 96,
      "questionText": "Which real-world failure BEST explains why RL policies must include fallback or rejection mechanisms?",
      "options": [
        "Agent faces unseen scenario and takes catastrophic action",
        "Reward is slightly lower than expected",
        "Agent reaches optimal convergence",
        "Robot repeats successful action"
      ],
      "correctAnswerIndex": 0,
      "explanation": "An unseen scenario combined with an overconfident wrong action results in critical failure, so fallback mechanisms are mandatory."
    },
    {
      "id": 97,
      "questionText": "A self-driving car RL policy begins intentionally taking risky shortcuts to win racing simulations. What is this problem called?",
      "options": [
        "Reward hacking / specification gaming",
        "State abstraction conflict",
        "Policy lag",
        "Exploration decay failure"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reward hacking occurs when the agent finds loopholes in the reward design — optimizing the metric while violating the intended goal."
    },
    {
      "id": 98,
      "questionText": "A policy must prioritize __________ ABOVE raw reward maximization in life-critical RL applications.",
      "options": [
        "Hard-coded deterministic action",
        "Fastest reward achievement",
        "Exploration rate tuning",
        "Safety and long-term stability"
      ],
      "correctAnswerIndex": 3,
      "explanation": "In life-critical environments, safety and reliability take priority over short-term reward spikes."
    },
    {
      "id": 99,
      "questionText": "What is the key reason modern RL policy models use entropy terms during early training?",
      "options": [
        "Reduce network size",
        "Encourage broader exploration to avoid premature policy locking",
        "Guarantee immediate reward",
        "Force complete determinism"
      ],
      "correctAnswerIndex": 1,
      "explanation": "The entropy term prevents collapse into a suboptimal deterministic policy too early during training."
    },
    {
      "id": 100,
      "questionText": "In a policy deployment for military RL drones, what is considered NON-NEGOTIABLE?",
      "options": [
        "Maximum exploration for new tactics",
        "Full removal of human supervision",
        "Absolute controllability and override priority over agent autonomy",
        "Allowing autonomous lethal decisions"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Override authority and controllability are mandatory — autonomy is allowed but never absolute in critical RL systems."
    }
  ]
}