{
  "title": "Reinforcement Learning Basics Mastery: 100 MCQs",
  "description": "A comprehensive set of 100 multiple-choice questions covering the fundamental concepts of Reinforcement Learning, including agent, environment, state, action, reward, policy, value functions, and exploration vs. exploitation.",
  "questions": [
    {
      "id": 1,
      "questionText": "In reinforcement learning, what is an agent?",
      "options": [
        "The reward signal",
        "The entity that interacts with the environment and learns from feedback",
        "A state of the system",
        "The environment itself"
      ],
      "correctAnswerIndex": 1,
      "explanation": "An agent is the learner or decision maker that takes actions in an environment and receives feedback (rewards) to improve its behavior."
    },
    {
      "id": 2,
      "questionText": "What does the environment represent in reinforcement learning?",
      "options": [
        "The reward signal only",
        "The agent itself",
        "Everything the agent interacts with, including states and rules",
        "The action space only"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The environment includes everything external to the agent, defining possible states, transitions, and feedback."
    },
    {
      "id": 3,
      "questionText": "What is a 'state' in the context of reinforcement learning?",
      "options": [
        "A description of the current situation of the environment",
        "The agent's action choice",
        "The policy function",
        "The reward received"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A state captures all relevant information about the environment at a specific time, which the agent can use to decide its next action."
    },
    {
      "id": 4,
      "questionText": "Which of the following is true about the interaction between agent and environment?",
      "options": [
        "Agent observes state, takes action, environment returns next state and reward",
        "Agent only observes rewards",
        "Environment takes action, agent returns state",
        "Environment updates agent’s policy directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The standard reinforcement learning loop: the agent observes a state, acts, and receives a reward and new state from the environment."
    },
    {
      "id": 5,
      "questionText": "What is the purpose of the reward signal?",
      "options": [
        "To provide feedback to the agent on the quality of its action",
        "To define the environment",
        "To choose the next action automatically",
        "To represent the current state"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The reward signal guides the agent, helping it learn which actions are beneficial or harmful."
    },
    {
      "id": 6,
      "questionText": "What does a fully observable environment mean?",
      "options": [
        "The agent knows only past rewards",
        "The agent cannot observe the state",
        "The agent has complete information about the current state",
        "The environment chooses the agent's actions"
      ],
      "correctAnswerIndex": 2,
      "explanation": "In a fully observable environment, the agent can see all relevant aspects of the current state to make optimal decisions."
    },
    {
      "id": 7,
      "questionText": "What is a partially observable environment?",
      "options": [
        "The agent has incomplete information about the current state",
        "Rewards are constant",
        "The agent observes everything",
        "The environment changes randomly without states"
      ],
      "correctAnswerIndex": 0,
      "explanation": "In partially observable environments, the agent must act based on incomplete or noisy observations."
    },
    {
      "id": 8,
      "questionText": "Which of the following defines the agent's behavior?",
      "options": [
        "Reward",
        "Policy",
        "State",
        "Environment"
      ],
      "correctAnswerIndex": 1,
      "explanation": "The policy is a mapping from states to actions that defines the agent’s behavior."
    },
    {
      "id": 9,
      "questionText": "What is the action space?",
      "options": [
        "The environment dynamics",
        "All possible states in the environment",
        "The set of reward values",
        "The set of all actions an agent can take in a given state"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The action space contains all the actions available to the agent at any point in time."
    },
    {
      "id": 10,
      "questionText": "What is the difference between deterministic and stochastic environments?",
      "options": [
        "There is no difference",
        "Deterministic: no rewards; Stochastic: rewards exist",
        "Deterministic: random actions; Stochastic: fixed actions",
        "Deterministic: same action leads to same outcome; Stochastic: action may lead to different outcomes"
      ],
      "correctAnswerIndex": 3,
      "explanation": "In deterministic environments, actions have predictable results. In stochastic environments, outcomes may vary probabilistically."
    },
    {
      "id": 11,
      "questionText": "Which of the following is an example of an agent in real life?",
      "options": [
        "The maze itself",
        "The state of the environment",
        "A robot navigating a maze",
        "A reward signal"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The agent is the decision-making entity that interacts with the environment, e.g., a robot in a maze."
    },
    {
      "id": 12,
      "questionText": "Which of the following is an example of a state in a grid-world environment?",
      "options": [
        "The reward function",
        "Agent’s current position on the grid",
        "The policy",
        "The set of all possible actions"
      ],
      "correctAnswerIndex": 1,
      "explanation": "A state represents the agent’s current location and situation in the environment."
    },
    {
      "id": 13,
      "questionText": "In reinforcement learning, what is the transition function?",
      "options": [
        "Specifies the policy",
        "Defines probability of moving from one state to another given an action",
        "Defines the reward only",
        "Specifies the agent"
      ],
      "correctAnswerIndex": 1,
      "explanation": "The transition function determines how the environment evolves in response to the agent's actions."
    },
    {
      "id": 14,
      "questionText": "What does a Markov Decision Process (MDP) assume about states?",
      "options": [
        "States are not related",
        "The future depends on all past states",
        "Rewards are ignored",
        "The future state depends only on the current state and action"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The Markov property assumes that the next state depends only on the current state and action."
    },
    {
      "id": 15,
      "questionText": "Which component is NOT part of a standard reinforcement learning framework?",
      "options": [
        "Agent",
        "Environment",
        "Loss function",
        "Reward signal"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The loss function is part of optimization in some algorithms, but not a fundamental RL component."
    },
    {
      "id": 16,
      "questionText": "What is the observation in reinforcement learning?",
      "options": [
        "The action chosen by the agent",
        "The policy itself",
        "The reward only",
        "The information received by the agent from the environment"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Observation is what the agent perceives about the environment at each time step."
    },
    {
      "id": 17,
      "questionText": "In an episodic task, what marks the end of an episode?",
      "options": [
        "The agent stops observing",
        "A terminal state is reached",
        "The reward is zero",
        "The environment resets automatically"
      ],
      "correctAnswerIndex": 1,
      "explanation": "An episode ends when the agent reaches a terminal state, after which learning may continue in a new episode."
    },
    {
      "id": 18,
      "questionText": "Which of the following best describes the policy function π(s)?",
      "options": [
        "A mapping from actions to rewards",
        "The transition function",
        "A mapping from states to probabilities of actions",
        "The environment model"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The policy π(s) defines the probability distribution over actions for each state."
    },
    {
      "id": 19,
      "questionText": "What is a terminal state?",
      "options": [
        "The starting state",
        "A state with no available actions only",
        "Any state with a negative reward",
        "A state where the episode ends and no further actions are taken"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Terminal states signal the end of an episode in reinforcement learning."
    },
    {
      "id": 20,
      "questionText": "Which component captures the dynamics of how actions affect the environment?",
      "options": [
        "Reward function only",
        "Agent",
        "Policy function",
        "Transition function"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The transition function defines how the environment responds to the agent's actions."
    },
    {
      "id": 21,
      "questionText": "In a stochastic environment, taking the same action twice may result in:",
      "options": [
        "Different next states and rewards",
        "Deterministic rewards only",
        "No change in environment",
        "Always the same next state"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Stochastic environments have probabilistic outcomes, so the same action may lead to different states and rewards."
    },
    {
      "id": 22,
      "questionText": "Which of the following is an example of an environment in reinforcement learning?",
      "options": [
        "The policy function",
        "The robot itself",
        "A chessboard on which a robot plays",
        "The reward function only"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The environment includes the world the agent interacts with, such as the chessboard in a robot chess task."
    },
    {
      "id": 23,
      "questionText": "Which type of environment requires the agent to infer hidden states?",
      "options": [
        "Deterministic environment",
        "Fully observable environment",
        "Partially observable environment",
        "Episodic environment"
      ],
      "correctAnswerIndex": 2,
      "explanation": "In partially observable environments, the agent cannot directly observe the true state and must infer it from observations."
    },
    {
      "id": 24,
      "questionText": "Which of the following is a key challenge for agents in large state spaces?",
      "options": [
        "Small reward",
        "Fully observable environment",
        "Curse of dimensionality",
        "Terminal states"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Large state spaces make it difficult for agents to explore and learn efficiently, a problem known as the curse of dimensionality."
    },
    {
      "id": 25,
      "questionText": "Which signal tells the agent how good an action was?",
      "options": [
        "Action",
        "Policy",
        "Reward",
        "State"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The reward is feedback from the environment that indicates the quality of an action."
    },
    {
      "id": 26,
      "questionText": "Which of the following is true for a Markov environment?",
      "options": [
        "The future depends only on the current state and action",
        "The state space is continuous only",
        "Rewards are always zero",
        "The future depends on all past actions"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Markov environments satisfy the Markov property, where the next state depends only on the current state and action."
    },
    {
      "id": 27,
      "questionText": "Which component allows the agent to explore and learn optimal behavior?",
      "options": [
        "Policy and reward feedback",
        "Transition function only",
        "State only",
        "Environment only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The agent uses its policy and reward feedback to explore and improve actions."
    },
    {
      "id": 28,
      "questionText": "In an RL problem, what is an episodic task?",
      "options": [
        "Task without any terminal state",
        "Task with only one action",
        "Task with clearly defined episodes ending in terminal states",
        "Task without rewards"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Episodic tasks consist of episodes with start and terminal states; each episode is independent."
    },
    {
      "id": 29,
      "questionText": "What is the role of exploration in reinforcement learning?",
      "options": [
        "Observing states without acting",
        "Always exploiting known rewards",
        "Ignoring the environment",
        "Trying new actions to discover their effects"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Exploration allows the agent to learn about unknown states and actions, balancing exploration and exploitation."
    },
    {
      "id": 30,
      "questionText": "Which statement describes the agent-environment loop?",
      "options": [
        "Agent observes reward only",
        "Environment observes agent → acts → gives state",
        "Agent observes state → takes action → receives reward → observes next state",
        "Environment updates policy directly"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The agent-environment loop is the core of reinforcement learning, where the agent interacts with the environment through actions and feedback."
    },
    {
      "id": 31,
      "questionText": "What distinguishes a deterministic policy from a stochastic policy?",
      "options": [
        "Deterministic policy has rewards; stochastic does not",
        "Deterministic policy selects a specific action for a state; stochastic policy assigns probabilities to actions",
        "Deterministic policy assigns probabilities; stochastic policy selects a fixed action",
        "Both are the same"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Deterministic policies always select a specific action for a state, whereas stochastic policies assign a probability distribution over possible actions."
    },
    {
      "id": 32,
      "questionText": "In a partially observable environment, what does the agent often maintain to act effectively?",
      "options": [
        "Fixed action list",
        "Reward history only",
        "Transition function only",
        "Belief state representing probability distribution over possible true states"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Belief states allow the agent to infer the true state when observations are incomplete or noisy."
    },
    {
      "id": 33,
      "questionText": "In which type of environment is an agent guaranteed to learn an optimal policy, given enough exploration?",
      "options": [
        "Non-Markov environment",
        "Continuous reward environment only",
        "Partially observable environment",
        "Markov environment"
      ],
      "correctAnswerIndex": 3,
      "explanation": "In Markov environments, the future depends only on the current state and action, making learning optimal policies feasible."
    },
    {
      "id": 34,
      "questionText": "What is the advantage of modeling an environment as an MDP?",
      "options": [
        "Removes need for rewards",
        "Reduces the state space automatically",
        "Guarantees deterministic transitions",
        "Enables mathematical analysis and algorithmic solution for optimal policies"
      ],
      "correctAnswerIndex": 3,
      "explanation": "MDPs provide a formal framework to model the agent-environment interaction mathematically, facilitating policy optimization."
    },
    {
      "id": 35,
      "questionText": "Which of the following best describes a reward function R(s,a,s')?",
      "options": [
        "Indicates terminal state only",
        "Predicts the next state",
        "Defines the policy",
        "Gives the immediate reward for transitioning from state s to s' using action a"
      ],
      "correctAnswerIndex": 3,
      "explanation": "The reward function assigns a scalar value indicating the immediate benefit of taking an action in a state and transitioning to the next state."
    },
    {
      "id": 36,
      "questionText": "Which of the following is true about terminal states in episodic tasks?",
      "options": [
        "The agent restarts immediately without learning",
        "Terminal states do not exist in episodic tasks",
        "Rewards become infinite",
        "Once reached, no further actions are taken in the episode"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Terminal states mark the end of an episode; learning continues by updating values or policies before starting a new episode."
    },
    {
      "id": 37,
      "questionText": "Which is an example of continuous state space?",
      "options": [
        "Number of books on a shelf",
        "Robot’s position in a 2D plane with real coordinates",
        "A traffic light with three colors",
        "A chessboard with 64 squares"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Continuous state spaces have infinitely many possible states, such as positions represented by real numbers."
    },
    {
      "id": 38,
      "questionText": "What does the exploration-exploitation trade-off mean?",
      "options": [
        "Choosing between reward and punishment",
        "Choosing between trying new actions (exploration) and using known best actions (exploitation)",
        "Choosing between deterministic and stochastic policies",
        "Choosing between discrete and continuous states"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Agents must balance exploring new actions to learn about the environment and exploiting known actions to maximize rewards."
    },
    {
      "id": 39,
      "questionText": "Which term describes a mapping from a state to an action probability distribution?",
      "options": [
        "Transition function",
        "Deterministic policy",
        "Reward function",
        "Stochastic policy"
      ],
      "correctAnswerIndex": 3,
      "explanation": "A stochastic policy assigns probabilities to possible actions in a given state."
    },
    {
      "id": 40,
      "questionText": "Which statement is true about the state transition function P(s'|s,a)?",
      "options": [
        "It defines the policy",
        "It measures agent’s performance",
        "It defines the probability of reaching state s' from state s using action a",
        "It gives the immediate reward only"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The transition function models the environment’s dynamics probabilistically, specifying how actions lead to next states."
    },
    {
      "id": 41,
      "questionText": "Which of the following is NOT a property of Markov Decision Processes?",
      "options": [
        "Transition probabilities exist for all actions",
        "Rewards depend only on current state and action",
        "Next state depends only on current state and action",
        "Agent history is needed to determine next state"
      ],
      "correctAnswerIndex": 3,
      "explanation": "MDPs satisfy the Markov property: the next state depends only on the current state and action, not the full history."
    },
    {
      "id": 42,
      "questionText": "In reinforcement learning, what is a value function?",
      "options": [
        "Determines the next action directly",
        "Gives immediate reward only",
        "Estimates expected cumulative reward from a state or state-action pair",
        "Represents environment transitions"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Value functions estimate how good it is to be in a state or take an action, guiding the agent toward optimal behavior."
    },
    {
      "id": 43,
      "questionText": "What is Q-learning primarily used for?",
      "options": [
        "Learning optimal action-value function without model of environment",
        "Defining transition probabilities",
        "Calculating rewards directly",
        "Predicting future states only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Q-learning is a model-free reinforcement learning algorithm that learns the expected cumulative reward (Q-values) for state-action pairs."
    },
    {
      "id": 44,
      "questionText": "Which of the following is a characteristic of a reward signal?",
      "options": [
        "A policy mapping",
        "A transition probability",
        "Scalar feedback indicating desirability of action outcome",
        "Vector describing all states"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Rewards provide scalar feedback to guide the agent toward favorable actions."
    },
    {
      "id": 45,
      "questionText": "Which type of environment requires the agent to maintain memory of past actions to perform well?",
      "options": [
        "Episodic tasks",
        "Partially observable environments",
        "Discrete environments only",
        "Fully observable deterministic environments"
      ],
      "correctAnswerIndex": 1,
      "explanation": "In partially observable environments, the agent may need to use past actions or observations to infer the current state."
    },
    {
      "id": 46,
      "questionText": "Which of the following represents a stochastic reward?",
      "options": [
        "The reward is always zero",
        "The reward is always 1",
        "The reward depends deterministically on the state only",
        "The reward received for the same action may vary probabilistically"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Stochastic rewards introduce randomness, making the same action yield different outcomes in different trials."
    },
    {
      "id": 47,
      "questionText": "What is the main challenge in large continuous state spaces?",
      "options": [
        "Rewards are absent",
        "Terminal states do not exist",
        "Policies are deterministic only",
        "Efficiently representing and learning value or policy functions"
      ],
      "correctAnswerIndex": 3,
      "explanation": "Continuous and high-dimensional states require function approximation techniques to represent policies and value functions efficiently."
    },
    {
      "id": 48,
      "questionText": "Which statement describes a fully observable episodic environment?",
      "options": [
        "Environment never resets",
        "Agent can see entire state and each episode ends at a terminal state",
        "Agent receives no rewards",
        "Agent cannot observe state"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Fully observable episodic environments allow the agent to see complete states and episodes terminate at terminal states."
    },
    {
      "id": 49,
      "questionText": "Which is a common approach to handle partial observability?",
      "options": [
        "Using deterministic rewards only",
        "Maintaining a belief state or memory of observations",
        "Always resetting the environment",
        "Ignoring the missing information"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Agents can maintain belief states or use history to make decisions in partially observable environments."
    },
    {
      "id": 50,
      "questionText": "Which of the following is an example of an action in a reinforcement learning problem?",
      "options": [
        "Moving a robot forward",
        "The robot’s position",
        "The environment itself",
        "The reward value"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Actions are choices made by the agent to interact with the environment."
    },
    {
      "id": 51,
      "questionText": "What is the purpose of a policy evaluation step in reinforcement learning?",
      "options": [
        "To estimate the value function for a given policy",
        "To reset the environment",
        "To define the transition function",
        "To select the next action directly"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Policy evaluation calculates how good a given policy is by computing expected cumulative rewards (value function) for states or state-action pairs."
    },
    {
      "id": 52,
      "questionText": "Which of the following is true for model-free reinforcement learning?",
      "options": [
        "States are fully deterministic",
        "Agent learns value or policy without knowing environment transitions",
        "Agent knows all transition probabilities in advance",
        "Rewards are not used"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Model-free methods learn from interaction and feedback without requiring a model of the environment’s dynamics."
    },
    {
      "id": 53,
      "questionText": "Which of the following best defines the state-value function V(s)?",
      "options": [
        "Expected cumulative reward from state s following a given policy",
        "Immediate reward of state s",
        "Set of possible actions",
        "Probability of transitioning to next state"
      ],
      "correctAnswerIndex": 0,
      "explanation": "V(s) estimates how good it is to be in a particular state under a specific policy."
    },
    {
      "id": 54,
      "questionText": "Which best defines the action-value function Q(s,a)?",
      "options": [
        "Expected cumulative reward from taking action a in state s following a given policy",
        "Immediate reward for state s only",
        "Policy mapping for s only",
        "Probability of reaching terminal state"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Q(s,a) estimates the expected return for taking an action in a state and following the policy thereafter."
    },
    {
      "id": 55,
      "questionText": "In a stochastic environment, the optimal policy may be:",
      "options": [
        "Independent of rewards",
        "Always deterministic",
        "Stochastic",
        "Always random"
      ],
      "correctAnswerIndex": 2,
      "explanation": "In stochastic environments, an optimal policy may assign probabilities to actions to maximize expected returns."
    },
    {
      "id": 56,
      "questionText": "Which term describes the probability distribution over next states given current state and action?",
      "options": [
        "Reward function R(s,a)",
        "Policy π(s)",
        "Transition function P(s'|s,a)",
        "Value function V(s)"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The transition function defines the environment dynamics in terms of probabilities for next states."
    },
    {
      "id": 57,
      "questionText": "Which type of task has no terminal state?",
      "options": [
        "Episodic task",
        "Fully observable task",
        "Partially observable task",
        "Continuing task"
      ],
      "correctAnswerIndex": 3,
      "explanation": "In continuing tasks, the agent interacts with the environment indefinitely without episodes ending in terminal states."
    },
    {
      "id": 58,
      "questionText": "Which of the following statements is true about discounted rewards?",
      "options": [
        "Rewards are ignored",
        "Future rewards are multiplied by a discount factor γ (0 ≤ γ ≤ 1)",
        "Only immediate rewards are considered",
        "All rewards are summed without discount"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Discounted rewards give less weight to future rewards, balancing immediate and long-term gains."
    },
    {
      "id": 59,
      "questionText": "Which approach can handle large or continuous state spaces?",
      "options": [
        "Terminal states only",
        "Function approximation (like neural networks)",
        "Ignoring rewards",
        "Tabular methods only"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Function approximation allows RL algorithms to scale to large or continuous states by generalizing across similar states."
    },
    {
      "id": 60,
      "questionText": "Which statement is true about on-policy learning?",
      "options": [
        "Agent learns using a different policy than the one it follows",
        "Policy is deterministic only",
        "Agent learns value or policy using the same policy it follows to act",
        "Agent does not consider rewards"
      ],
      "correctAnswerIndex": 2,
      "explanation": "On-policy methods update value or policy estimates based on the policy currently being followed by the agent."
    },
    {
      "id": 61,
      "questionText": "Which statement describes off-policy learning?",
      "options": [
        "Requires fully observable environment only",
        "Learning only immediate rewards",
        "Learning about one policy while following another policy",
        "Ignoring actions"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Off-policy methods allow the agent to evaluate or improve a policy different from the behavior policy used to generate data."
    },
    {
      "id": 62,
      "questionText": "Which of the following can improve exploration in RL?",
      "options": [
        "Reducing state space only",
        "Adding randomness to action selection (e.g., ε-greedy)",
        "Always selecting the best-known action",
        "Ignoring rewards"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Exploration strategies like ε-greedy help the agent discover better actions instead of always exploiting known ones."
    },
    {
      "id": 63,
      "questionText": "Which scenario represents a partially observable environment?",
      "options": [
        "Moving pieces on a completely known board",
        "A fixed deterministic reward system",
        "A robot navigating a room with occluded areas",
        "A chess game with full board visible"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Partial observability occurs when the agent cannot directly perceive the full state of the environment."
    },
    {
      "id": 64,
      "questionText": "Which of the following is a key benefit of defining MDPs?",
      "options": [
        "Eliminates the need for rewards",
        "Provides a formal structure for planning and learning optimal policies",
        "Reduces state space automatically",
        "Removes stochasticity"
      ],
      "correctAnswerIndex": 1,
      "explanation": "MDPs formalize the problem so agents can use algorithms like dynamic programming, Monte Carlo, or Q-learning to optimize policies."
    },
    {
      "id": 65,
      "questionText": "Which of the following represents a terminal reward?",
      "options": [
        "Random reward at any state",
        "Reward at every step",
        "A reward received upon reaching a terminal state",
        "Reward ignored in final step"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Terminal rewards are obtained only when the agent reaches a terminal state, signaling the end of an episode."
    },
    {
      "id": 66,
      "questionText": "Which of the following is true about value iteration?",
      "options": [
        "Estimates reward only",
        "Computes optimal value function iteratively using Bellman optimality equation",
        "Ignores transition probabilities",
        "Updates policy randomly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Value iteration is a dynamic programming algorithm that iteratively updates state values to converge to the optimal value function."
    },
    {
      "id": 67,
      "questionText": "Which approach can solve RL problems without knowing the transition function?",
      "options": [
        "Value iteration with model",
        "Policy evaluation with model",
        "Model-free reinforcement learning",
        "Dynamic programming"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Model-free RL methods learn directly from experience without requiring knowledge of environment dynamics."
    },
    {
      "id": 68,
      "questionText": "Which component of RL represents the agent’s knowledge about the environment?",
      "options": [
        "Terminal state",
        "State only",
        "Value function or Q-function",
        "Action space only"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Value functions or Q-functions encode the agent's learned knowledge about which states or actions are desirable."
    },
    {
      "id": 69,
      "questionText": "Which of the following best describes a reward shaping technique?",
      "options": [
        "Altering the action space only",
        "Ignoring exploration",
        "Modifying the reward function to accelerate learning",
        "Changing the state space"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Reward shaping adds additional rewards to guide the agent more efficiently without changing the optimal policy."
    },
    {
      "id": 70,
      "questionText": "Which type of RL problem is suitable for continuing tasks?",
      "options": [
        "Tasks with ongoing interactions and no terminal state",
        "Episodic tasks only",
        "Tasks with fixed-length episodes only",
| "Tasks with stochastic rewards ignored" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Continuing tasks involve ongoing interaction with no defined terminal states, such as monitoring a robot continuously." | |
| }, | |
| { | |
| "id": 71, | |
| "questionText": "A robot is navigating a warehouse with moving obstacles. Which type of environment does it face?", | |
| "options": [ | |
| "Fully observable and stochastic", | |
| "Partially observable and deterministic", | |
| "Partially observable and stochastic", | |
| "Fully observable and deterministic" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The robot cannot always see all obstacles (partial observability) and the movement of obstacles is unpredictable (stochastic)." | |
| }, | |
| { | |
| "id": 72, | |
| "questionText": "An agent receives rewards only at the end of a maze. Which learning challenge is most significant?", | |
| "options": [ | |
| "Partial observability", | |
| "Continuous state representation", | |
| "Delayed reward problem", | |
| "Exploration-exploitation trade-off" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "When rewards are delayed, the agent must figure out which actions contributed to eventual success, making learning harder." | |
| }, | |
| { | |
| "id": 73, | |
| "questionText": "A drone must decide its path to maximize battery life and avoid collisions. Which components are crucial for its RL model?", | |
| "options": [ | |
| "Reward function only", | |
| "Terminal state only", | |
| "State (position, velocity), actions (move directions), rewards (safety + battery efficiency)", | |
| "Policy only" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "The drone’s model needs full state representation, available actions, and a reward function capturing multiple objectives." | |
| }, | |
| { | |
| "id": 74, | |
| "questionText": "A game AI observes opponent moves but cannot see hidden cards. Which property describes its environment?", | |
| "options": [ | |
| "Partially observable", | |
| "Deterministic", | |
| "Fully observable", | |
| "Episodic only" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The agent has incomplete information due to hidden cards, making the environment partially observable." | |
| }, | |
| { | |
| "id": 75, | |
| "questionText": "In a stock trading simulation, the agent sees only past prices and indicators. What RL challenge is this?", | |
| "options": [ | |
| "Deterministic policy", | |
| "Partial observability and delayed reward", | |
| "Terminal state only", | |
| "Immediate rewards only" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "The agent must infer the hidden market state from past observations and often receives profit/loss rewards after some delay." | |
| }, | |
| { | |
| "id": 76, | |
| "questionText": "A robot has sensors with noise. Which approach can help it act reliably?", | |
| "options": [ | |
| "Use deterministic rewards only", | |
| "Act randomly always", | |
| "Maintain belief state or use filtering techniques", | |
| "Ignore sensor readings" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Belief states or filters help the agent estimate the true state from noisy observations." | |
| }, | |
| { | |
| "id": 77, | |
| "questionText": "In a maze with traps, the agent must minimize risk while reaching the goal. Which reward design is best?", | |
| "options": [ | |
| "Positive reward for reaching goal, negative reward for traps", | |
| "Rewards for every step only", | |
| "Random rewards everywhere", | |
| "No reward at all" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Assigning penalties for traps encourages safe paths while rewarding goal completion guides the agent." | |
| }, | |
| { | |
| "id": 78, | |
| "questionText": "A robot learns by observing a human performing a task first. Which type of RL method does this resemble?", | |
| "options": [ | |
| "Value iteration only", | |
| "Random exploration only", | |
| "Model-free Q-learning only", | |
| "Imitation learning / guided RL" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Imitation learning leverages demonstrations to guide the agent before independent reinforcement learning." | |
| }, | |
| { | |
| "id": 79, | |
| "questionText": "In an autonomous car simulation, the agent sees other cars’ positions and velocities. Which component represents this information?", | |
| "options": [ | |
| "State", | |
| "Policy", | |
| "Action", | |
| "Reward" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The state captures the current situation the agent observes, including positions and velocities of other cars." | |
| }, | |
| { | |
| "id": 80, | |
| "questionText": "A reinforcement learning agent must act in a changing environment. Which feature is critical?", | |
| "options": [ | |
| "Terminal states only", | |
| "Fixed rewards only", | |
| "Adaptive policy that can update over time", | |
| "Static value function only" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "In non-stationary environments, the agent must adapt its policy based on new observations and feedback." | |
| }, | |
| { | |
| "id": 81, | |
| "questionText": "In a game where rewards are sparse, which strategy improves learning efficiency?", | |
| "options": [ | |
| "Always taking the first available action", | |
| "Ignoring delayed rewards", | |
| "Reward shaping to provide intermediate feedback", | |
| "Reducing state space arbitrarily" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Reward shaping provides intermediate signals, helping the agent learn even when main rewards are rare." | |
| }, | |
| { | |
| "id": 82, | |
| "questionText": "A delivery drone must optimize routes while avoiding bad weather. What is a suitable reward structure?", | |
| "options": [ | |
| "Random reward for actions", | |
| "Positive for deliveries, negative for entering bad weather zones", | |
| "Reward only for flight time", | |
| "Reward constant at all states" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "The reward structure guides the agent to balance speed and safety." | |
| }, | |
| { | |
| "id": 83, | |
| "questionText": "Which type of environment is represented by a multi-agent traffic simulation?", | |
| "options": [ | |
| "Episodic only", | |
| "Single-agent deterministic", | |
| "Fully observable and deterministic", | |
| "Partially observable and stochastic" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Each agent sees limited information and other agents behave unpredictably, making the environment partially observable and stochastic." | |
| }, | |
| { | |
| "id": 84, | |
| "questionText": "An agent in a factory can perform actions that affect multiple machines. Which is critical for modeling this RL problem?", | |
| "options": [ | |
| "State should include all relevant machine statuses", | |
| "Terminal state alone is sufficient", | |
| "Only the reward matters", | |
| "Policy is unnecessary" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "To make informed decisions, the agent must have a state representation capturing all relevant aspects of the environment." | |
| }, | |
| { | |
| "id": 85, | |
| "questionText": "Which challenge arises in partially observable environments with stochastic transitions?", | |
| "options": [ | |
| "Terminal states disappear", | |
| "Actions are deterministic only", | |
| "The agent must infer hidden states and handle uncertainty simultaneously", | |
| "Rewards are always zero" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Partial observability and stochasticity require advanced strategies like belief state tracking to make good decisions." | |
| }, | |
| { | |
| "id": 86, | |
| "questionText": "A robot arm must assemble objects and avoid collisions. Which learning strategy helps?", | |
| "options": [ | |
| "Random movements only", | |
| "Ignoring collisions", | |
| "Combining reward penalties for collisions with positive rewards for assembly", | |
| "Reward only at end of task" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Carefully designed rewards for positive and negative outcomes guide the agent toward safe and efficient behavior." | |
| }, | |
| { | |
| "id": 87, | |
| "questionText": "In reinforcement learning, what is the primary difficulty in environments with delayed rewards?", | |
| "options": [ | |
| "Terminal states are not defined", | |
| "Actions are deterministic only", | |
| "States are fully observable", | |
| "Credit assignment problem (figuring out which actions caused the reward)" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The agent must determine which past actions contributed to the reward, which is the credit assignment problem." | |
| }, | |
| { | |
| "id": 88, | |
| "questionText": "Which technique helps an agent handle large continuous action spaces?", | |
| "options": [ | |
| "Ignoring actions", | |
| "Tabular Q-learning only", | |
| "Function approximation or actor-critic methods", | |
| "Random exploration without guidance" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Function approximation methods allow the agent to represent policies and value functions efficiently for continuous spaces." | |
| }, | |
| { | |
| "id": 89, | |
| "questionText": "An agent receives multiple objectives (speed, energy efficiency, safety). Which RL approach is suitable?", | |
| "options": [ | |
| "Terminal states only", | |
| "Ignoring rewards", | |
| "Single-objective Q-learning only", | |
| "Multi-objective reinforcement learning" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Multi-objective RL allows the agent to optimize across several competing criteria simultaneously." | |
| }, | |
| { | |
| "id": 90, | |
| "questionText": "Which scenario exemplifies an agent learning in a non-stationary environment?", | |
| "options": [ | |
| "Chess with fixed rules", | |
| "A fixed maze with stationary obstacles", | |
| "A deterministic robot path", | |
| "Stock market prices changing over time" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Non-stationary environments change over time, requiring the agent to adapt continuously." | |
| }, | |
| { | |
| "id": 91, | |
| "questionText": "In a warehouse robot problem, how can partial observability be mitigated?", | |
| "options": [ | |
| "Using sensors, memory, or belief states", | |
| "Terminal state modifications", | |
| "Ignoring hidden objects", | |
| "Reward shaping only" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "Advanced sensors and state estimation techniques help infer hidden or occluded parts of the environment." | |
| }, | |
| { | |
| "id": 92, | |
| "questionText": "An agent must choose actions under uncertainty. Which property of the RL model is critical?", | |
| "options": [ | |
| "Terminal states only", | |
| "Fixed policy only", | |
| "Deterministic rewards only", | |
| "Transition probabilities (stochastic model)" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Knowing or estimating transition probabilities is essential when dealing with uncertainty in the environment." | |
| }, | |
| { | |
| "id": 93, | |
| "questionText": "In a robot soccer game, which component captures positions, velocities, and ball possession?", | |
| "options": [ | |
| "Reward", | |
| "Action", | |
| "Policy", | |
| "State" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "The state must represent all relevant information needed to make decisions." | |
| }, | |
| { | |
| "id": 94, | |
| "questionText": "Which scenario demonstrates the credit assignment problem?", | |
| "options": [ | |
| "Terminal state immediately reached", | |
| "Agent receives a reward at the end of a long sequence of actions", | |
| "Deterministic transitions only", | |
| "Reward at every step" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "When rewards are delayed, the agent must determine which earlier actions were responsible." | |
| }, | |
| { | |
| "id": 95, | |
| "questionText": "A reinforcement learning agent uses both exploration and exploitation. Which method represents this?", | |
| "options": [ | |
| "Terminal state selection", | |
| "Random actions only", | |
| "Deterministic policy only", | |
| "ε-greedy policy" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "An ε-greedy policy allows the agent to explore with probability ε and exploit the best-known action otherwise." | |
| }, | |
| { | |
| "id": 96, | |
| "questionText": "An agent must operate in real-time with changing conditions. Which design consideration is essential?", | |
| "options": [ | |
| "Delayed reward only", | |
| "Fixed value function only", | |
| "Adaptive policy updates", | |
| "Ignoring stochasticity" | |
| ], | |
| "correctAnswerIndex": 2, | |
| "explanation": "Real-time adaptation is necessary for performance in dynamic, changing environments." | |
| }, | |
| { | |
| "id": 97, | |
| "questionText": "A robot navigating in fog receives uncertain sensor readings. Which approach is appropriate?", | |
| "options": [ | |
| "Terminal rewards only", | |
| "Use belief states or filtering methods to estimate the true state", | |
| "Use deterministic policy only", | |
| "Ignore sensor noise" | |
| ], | |
| "correctAnswerIndex": 1, | |
| "explanation": "Filtering techniques help the agent estimate the true state from noisy or partial observations." | |
| }, | |
| { | |
| "id": 98, | |
| "questionText": "An agent must balance speed and energy usage in a task. Which RL concept helps design its behavior?", | |
| "options": [ | |
| "State space reduction only", | |
| "Random actions only", | |
| "Terminal states only", | |
| "Reward function that combines multiple objectives" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Designing a reward function that incorporates multiple objectives guides the agent to optimal trade-offs." | |
| }, | |
| { | |
| "id": 99, | |
| "questionText": "Which situation requires an agent to maintain memory of previous states?", | |
| "options": [ | |
| "Terminal states only", | |
| "Deterministic rewards only", | |
| "Fully observable deterministic environments", | |
| "Partially observable environments" | |
| ], | |
| "correctAnswerIndex": 3, | |
| "explanation": "Memory or belief states are needed when the agent cannot directly observe the full environment." | |
| }, | |
| { | |
| "id": 100, | |
| "questionText": "A reinforcement learning agent must adapt policies as other agents change strategies. Which environment type is this?", | |
| "options": [ | |
| "Multi-agent and non-stationary", | |
| "Terminal state only", | |
| "Single-agent deterministic", | |
| "Episodic only" | |
| ], | |
| "correctAnswerIndex": 0, | |
| "explanation": "The agent interacts with other adaptive agents, making the environment non-stationary and multi-agent." | |
| } | |
| ] | |
| } | |