{
"title": "Reinforcement Learning Policy Mastery: 100 MCQs",
"description": "A comprehensive set of 100 multiple-choice questions covering RL Policies, including actions, policy types (deterministic, stochastic, greedy, ε-greedy), optimization, exploration vs. exploitation, and real-world scenarios.",
"questions": [
{
"id": 1,
"questionText": "In reinforcement learning, what is an action?",
"options": [
"A fixed rule of the environment",
"A numerical reward signal",
"A choice made by the agent to interact with the environment",
"Information received from the environment"
],
"correctAnswerIndex": 2,
"explanation": "An action is the decision taken by the agent to affect the environment."
},
{
"id": 2,
"questionText": "What is a policy in reinforcement learning?",
"options": [
"A mapping from states to actions",
"A sequence of states",
"The transition probability",
"The total reward received"
],
"correctAnswerIndex": 0,
"explanation": "A policy is a strategy that decides which action to take in each state."
},
{
"id": 3,
"questionText": "A policy is denoted mathematically as:",
"options": [
"γ",
"R(s)",
"π(a|s)",
"P(s’|s,a)"
],
"correctAnswerIndex": 2,
"explanation": "π(a|s) defines the probability of taking action a when in state s."
},
{
"id": 4,
"questionText": "A deterministic policy always:",
"options": [
"Selects the same action for a given state",
"Depends only on reward",
"Selects random actions",
"Changes over time"
],
"correctAnswerIndex": 0,
"explanation": "A deterministic policy selects the same action every time for a specific state."
},
{
"id": 5,
"questionText": "A stochastic policy:",
"options": [
"Does not involve randomness",
"Ignores the current state",
"Gives a probability distribution over actions",
"Always selects the same action"
],
"correctAnswerIndex": 2,
"explanation": "A stochastic policy gives probabilities for each possible action."
},
{
"id": 6,
"questionText": "Which of the following represents an action set?",
"options": [
"History of transitions",
"All possible states in the environment",
"All rewards received",
"All possible decisions the agent can make at a state"
],
"correctAnswerIndex": 3,
"explanation": "The action set is the list of all actions the agent can choose."
},
{
"id": 7,
"questionText": "A policy helps the agent to:",
"options": [
"Modify the environment",
"Decide which action to take",
"Terminate learning immediately",
"Change the reward function"
],
"correctAnswerIndex": 1,
"explanation": "A policy directly guides the agent’s decisions at each state."
},
{
"id": 8,
"questionText": "An optimal policy aims to:",
"options": [
"Only get immediate rewards",
"Avoid terminal states",
"Maximize long-term cumulative reward",
"Minimize actions taken"
],
"correctAnswerIndex": 2,
"explanation": "Optimal policies aim to maximize cumulative return, not just immediate rewards."
},
{
"id": 9,
"questionText": "Which symbol is commonly used for policy?",
"options": [
"λ (lambda)",
"δ (delta)",
"π (pi)",
"μ (mu)"
],
"correctAnswerIndex": 2,
"explanation": "Policy is conventionally represented as π in reinforcement learning literature."
},
{
"id": 10,
"questionText": "In reinforcement learning, actions are chosen based on:",
"options": [
"Fixed environment rules",
"The current policy",
"Random guessing",
"The reward only"
],
"correctAnswerIndex": 1,
"explanation": "The agent follows its policy to choose an action in each state."
},
{
"id": 11,
"questionText": "A policy π(a|s) defines:",
"options": [
"The reward received after an action",
"The probability of taking action a in state s",
"The discount factor",
"The next state transition"
],
"correctAnswerIndex": 1,
"explanation": "π(a|s) gives the probability that the agent selects action a when it is in state s."
},
{
"id": 12,
"questionText": "Which type of policy is commonly used during exploration?",
"options": [
"Deterministic policy",
"Static policy",
"Stochastic policy",
"Greedy policy"
],
"correctAnswerIndex": 2,
"explanation": "Stochastic policies allow randomness and help the agent to explore better."
},
{
"id": 13,
"questionText": "What does a greedy policy do?",
"options": [
"Always selects the action with the highest estimated value",
"Selects random actions",
"Avoids high rewards",
"Maximizes long-term regret"
],
"correctAnswerIndex": 0,
"explanation": "A greedy policy always selects the action currently believed to give the highest reward."
},
{
"id": 14,
"questionText": "The ε-greedy policy:",
"options": [
"Always exploits the best action",
"Balances exploration and exploitation",
"Never selects random actions",
"Explores only"
],
"correctAnswerIndex": 1,
"explanation": "ε-greedy policy chooses the best action most of the time, but sometimes explores randomly."
},
{
"id": 15,
"questionText": "A policy directly controls:",
"options": [
"How rewards are given",
"The discount factor",
"Which action the agent selects",
"The transition probability"
],
"correctAnswerIndex": 2,
"explanation": "The policy is responsible for selecting which action to take in each state."
},
{
"id": 16,
"questionText": "Which of the following is true for a deterministic policy?",
"options": [
"It randomly selects an action",
"It maps each state to exactly one action",
"It gives probabilities for actions",
"It has no control over actions"
],
"correctAnswerIndex": 1,
"explanation": "Deterministic policy chooses exactly one action for a given state."
},
{
"id": 17,
"questionText": "In reinforcement learning, the agent follows a policy to:",
"options": [
"Generate rewards",
"Stop learning",
"Define terminal states",
"Decide actions"
],
"correctAnswerIndex": 3,
"explanation": "The policy guides the agent in selecting actions during interaction."
},
{
"id": 18,
"questionText": "A policy that selects actions purely based on current reward and ignores future rewards is:",
"options": [
"Myopic",
"Long-term",
"Optimal",
"Model-based"
],
"correctAnswerIndex": 0,
"explanation": "A myopic policy only focuses on immediate rewards and not the future."
},
{
"id": 19,
"questionText": "Which of the following best describes the role of a policy?",
"options": [
"It stores previous rewards",
"It defines the agent’s behavior strategy",
"It calculates final reward only",
"It controls environment dynamics"
],
"correctAnswerIndex": 1,
"explanation": "A policy is the strategy that the agent uses to decide what action to take."
},
{
"id": 20,
"questionText": "What is the goal of policy optimization?",
"options": [
"To eliminate all randomness",
"To improve the policy for higher long-term rewards",
"To reduce the number of states",
"To modify the reward function"
],
"correctAnswerIndex": 1,
"explanation": "Policy optimization aims to improve decision-making for maximum cumulative reward."
},
{
"id": 21,
"questionText": "Which type of policy is preferred during exploitation?",
"options": [
"Exploratory",
"Myopic",
"Fully random",
"Deterministic"
],
"correctAnswerIndex": 3,
"explanation": "Exploitation prefers deterministic behavior for consistent maximum return."
},
{
"id": 22,
"questionText": "What does ε represent in ε-greedy policy?",
"options": [
"Learning rate",
"Probability of exploration",
"State transition cost",
"Discount factor"
],
"correctAnswerIndex": 1,
"explanation": "ε controls how often the agent explores randomly instead of exploiting."
},
{
"id": 23,
"questionText": "A policy that changes and improves over time is called:",
"options": [
"Fixed policy",
"Adaptive policy",
"Terminal policy",
"Static policy"
],
"correctAnswerIndex": 1,
"explanation": "Adaptive policies evolve with learning and improve performance over time."
},
{
"id": 24,
"questionText": "Which algorithm often improves policy iteratively?",
"options": [
"Clustering",
"Bubble sort",
"Policy gradient",
"Binary search"
],
"correctAnswerIndex": 2,
"explanation": "Policy gradient is a reinforcement learning method specifically designed to optimize policies."
},
{
"id": 25,
"questionText": "The policy essentially forms the agent’s:",
"options": [
"Memory buffer",
"Reward system",
"Behavior strategy",
"Environment model"
],
"correctAnswerIndex": 2,
"explanation": "The policy is the behavior strategy used to choose actions."
},
{
"id": 26,
"questionText": "Which policy guarantees the same output for each particular input state?",
"options": [
"Random policy",
"Stochastic policy",
"Deterministic policy",
"Exploration-only policy"
],
"correctAnswerIndex": 2,
"explanation": "Deterministic policies always map one state to one fixed action."
},
{
"id": 27,
"questionText": "What is the output of a policy?",
"options": [
"A future state",
"A value estimate",
"A reward signal",
"An action selection"
],
"correctAnswerIndex": 3,
"explanation": "The policy outputs an action based on the current state."
},
{
"id": 28,
"questionText": "Which of the following is true?",
"options": [
"Discount factor selects actions",
"Environment follows the agent’s rules",
"Policy decides the action; environment gives the outcome",
"Policy gives reward directly"
],
"correctAnswerIndex": 2,
"explanation": "Policy → action. Environment → next state and reward."
},
{
"id": 29,
"questionText": "Which kind of policy is more flexible and good for exploration?",
"options": [
"Static policy",
"Deterministic policy",
"History-free policy",
"Stochastic policy"
],
"correctAnswerIndex": 3,
"explanation": "Stochastic policies are better for discovering new strategies via randomness."
},
{
"id": 30,
"questionText": "A policy that does not change over time is called:",
"options": [
"Optimal policy",
"Static policy",
"Gradient policy",
"Adaptive policy"
],
"correctAnswerIndex": 1,
"explanation": "A static policy remains fixed throughout the learning process."
},
{
"id": 31,
"questionText": "Which policy type is best suited for environments with high uncertainty or partially observable states?",
"options": [
"Stochastic policy",
"Deterministic policy",
"Static policy",
"Greedy policy"
],
"correctAnswerIndex": 0,
"explanation": "Stochastic policies allow exploration and handle uncertainty better by assigning probabilities to actions."
},
{
"id": 32,
"questionText": "In reinforcement learning, what does the policy π directly depend on?",
"options": [
"Past rewards only",
"Current state",
"Network latency",
"Future rewards only"
],
"correctAnswerIndex": 1,
"explanation": "Policy π(a|s) maps the current state s to an action or probability distribution over actions."
},
{
"id": 33,
"questionText": "A greedy policy always:",
"options": [
"Avoids previously selected actions",
"Explores equally among all actions",
"Chooses the action with highest estimated value",
"Chooses randomly"
],
"correctAnswerIndex": 2,
"explanation": "Greedy policy exploits knowledge and always selects the best-known action."
},
{
"id": 34,
"questionText": "What happens if a policy focuses only on short-term rewards?",
"options": [
"It becomes myopic and may miss long-term gains",
"It guarantees optimal performance",
"It never learns from mistakes",
"It requires no environment feedback"
],
"correctAnswerIndex": 0,
"explanation": "A short-sighted policy (myopic) becomes greedy and may miss high future rewards."
},
{
"id": 35,
"questionText": "Which policy ensures a fixed mapping from state to action but does NOT involve randomness?",
"options": [
"Stochastic policy",
"Deterministic policy",
"Exploratory policy",
"Random policy"
],
"correctAnswerIndex": 1,
"explanation": "Deterministic policy gives the same action every time for the same state."
},
{
"id": 36,
"questionText": "An ε-greedy policy will choose a random action with probability:",
"options": [
"1 − ε",
"ε",
"0",
"Always"
],
"correctAnswerIndex": 1,
"explanation": "ε controls exploration; with probability ε, a random action is selected."
},
{
"id": 37,
"questionText": "What is the purpose of policy improvement?",
"options": [
"To reduce the number of actions",
"To make the policy produce better long-term rewards",
"To disable exploration",
"To eliminate randomness"
],
"correctAnswerIndex": 1,
"explanation": "Policy improvement updates the agent’s strategy toward higher future returns."
},
{
"id": 38,
"questionText": "Which type of policy selects an action with a probability proportional to its value?",
"options": [
"Static policy",
"Deterministic policy",
"Greedy policy",
"Softmax policy"
],
"correctAnswerIndex": 3,
"explanation": "Softmax policies assign action probabilities based on exponentiated value estimates."
},
{
"id": 39,
"questionText": "In reinforcement learning, a policy defines:",
"options": [
"The environment transitions",
"The discount factor",
"The reward assignment logic",
"The rule for choosing actions"
],
"correctAnswerIndex": 3,
"explanation": "A policy completely governs how an agent selects actions in each state."
},
{
"id": 40,
"questionText": "If a policy never explores new actions, it may:",
"options": [
"Increase randomness over time",
"Get stuck in a suboptimal behavior",
"Automatically find the best policy",
"Reset the environment state"
],
"correctAnswerIndex": 1,
"explanation": "Without exploration, the agent may get stuck in local optimum solutions."
},
{
"id": 41,
"questionText": "Which policy approach balances both exploitation and exploration?",
"options": [
"Greedy policy",
"Deterministic policy",
"Static policy",
"ε-greedy policy"
],
"correctAnswerIndex": 3,
"explanation": "ε-greedy selects the best-known action most of the time while still exploring occasionally."
},
{
"id": 42,
"questionText": "Which policy is purely exploitation-based?",
"options": [
"ε-greedy policy",
"Softmax policy",
"Stochastic policy",
"Greedy policy"
],
"correctAnswerIndex": 3,
"explanation": "Greedy policy always selects the current best action and does not explore."
},
{
"id": 43,
"questionText": "A policy that does not change over time is called:",
"options": [
"Adaptive policy",
"Exploratory policy",
"Parameterized policy",
"Fixed or static policy"
],
"correctAnswerIndex": 3,
"explanation": "Static policies remain unchanged during training."
},
{
"id": 44,
"questionText": "Which type of policy is optimized directly in Policy Gradient methods?",
"options": [
"Static policy",
"Stochastic policy",
"Rule-based policy",
"Greedy policy"
],
"correctAnswerIndex": 1,
"explanation": "Policy gradient algorithms operate directly on stochastic parameterized policies."
},
{
"id": 45,
"questionText": "What is the core objective of optimal policy?",
"options": [
"To reduce number of actions",
"To minimize time spent in each state",
"To maximize cumulative long-term reward",
"To avoid terminal states"
],
"correctAnswerIndex": 2,
"explanation": "The goal of optimal policy is to maximize expected discounted return over time."
},
{
"id": 46,
"questionText": "Which policy type outputs a probability distribution over actions?",
"options": [
"Static policy",
"Deterministic policy",
"Stochastic policy",
"Greedy policy"
],
"correctAnswerIndex": 2,
"explanation": "Stochastic policies output action probabilities rather than a fixed action."
},
{
"id": 47,
"questionText": "Which situation requires stochastic policies over deterministic ones?",
"options": [
"When multiple actions have equal rewards",
"Reward is guaranteed every step",
"No randomness is allowed",
"Agent already knows full environment"
],
"correctAnswerIndex": 0,
"explanation": "Stochastic policies prevent bias when actions are equally good."
},
{
"id": 48,
"questionText": "Which policy ensures full exploration but no exploitation?",
"options": [
"Deterministic policy",
"Greedy policy",
"Softmax policy",
"Random policy"
],
"correctAnswerIndex": 3,
"explanation": "Random policy entirely explores without considering reward values."
},
{
"id": 49,
"questionText": "The policy improvement step checks:",
"options": [
"Only immediate rewards",
"If taking a better action increases long-term reward",
"If randomness is eliminated",
"The size of the replay buffer"
],
"correctAnswerIndex": 1,
"explanation": "Policy improvement ensures updated actions lead to better expected return."
},
{
"id": 50,
"questionText": "What happens if ε in ε-greedy policy is too high?",
"options": [
"All rewards become zero",
"Policy becomes deterministic",
"Agent explores too much and learns slowly",
"Agent gets stuck in exploitation"
],
"correctAnswerIndex": 2,
"explanation": "Too high ε causes too much random exploration and slower learning."
},
{
"id": 51,
"questionText": "What is the main disadvantage of a purely greedy policy?",
"options": [
"It explores too much",
"It may get stuck in a local optimum",
"It ignores rewards completely",
"It requires deep models only"
],
"correctAnswerIndex": 1,
"explanation": "Greedy policy exploits only current best action and may miss higher long-term rewards."
},
{
"id": 52,
"questionText": "Which policy is essential for exploration-exploitation balance in RL?",
"options": [
"ε-greedy policy",
"Greedy policy",
"Random policy",
"Static policy"
],
"correctAnswerIndex": 0,
"explanation": "ε-greedy smartly balances exploitation with occasional exploration."
},
{
"id": 53,
"questionText": "Why are stochastic policies preferred over deterministic ones in partially observable environments?",
"options": [
"They prevent overfitting to rewards.",
"They improve exploration and robustness to uncertainty.",
"They guarantee immediate reward.",
"They reduce computation time."
],
"correctAnswerIndex": 1,
"explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings."
},
{
"id": 54,
"questionText": "Policy π(a|s) in RL typically represents:",
"options": [
"Transition from state s to s'",
"Value of the state",
"Sum of discounted rewards",
"Probability of selecting action a at state s"
],
"correctAnswerIndex": 3,
"explanation": "π(a|s) expresses how likely the agent is to take action a in state s."
},
{
"id": 55,
"questionText": "What happens if ε is set to 0 in ε-greedy policy?",
"options": [
"It stops learning",
"It becomes fully random",
"The policy becomes fully greedy",
"It becomes softmax-based"
],
"correctAnswerIndex": 2,
"explanation": "ε = 0 means no exploration — pure greedy exploitation."
},
{
"id": 56,
"questionText": "Softmax policy chooses actions based on:",
"options": [
"Normalized exponentiated action values",
"Fixed priority order",
"Pure randomness",
"Greedy selection only"
],
"correctAnswerIndex": 0,
"explanation": "Softmax uses exponentiated Q-values and applies probability distribution."
},
{
"id": 57,
"questionText": "A well-optimized policy should be:",
"options": [
"Stable and yield maximum cumulative reward",
"Constantly changing even after convergence",
"Ignoring future rewards",
"Completely random"
],
"correctAnswerIndex": 0,
"explanation": "An ideal policy should maximize cumulative return with stable decision-making."
},
{
"id": 58,
"questionText": "Which policy is more suitable for early training?",
"options": [
"Deterministic policy",
"Greedy policy",
"Static policy",
"Stochastic or ε-greedy policy"
],
"correctAnswerIndex": 3,
"explanation": "Stochastic or ε-greedy policies help explore environment during learning phase."
},
{
"id": 59,
"questionText": "A deterministic policy π(s) returns:",
"options": [
"A random action every time",
"A probability distribution",
"A single fixed action for state s",
"The immediate reward"
],
"correctAnswerIndex": 2,
"explanation": "Deterministic policy gives one specific action for each state."
},
{
"id": 60,
"questionText": "What problem does the ε-greedy policy try to solve?",
"options": [
"Reward scaling issue",
"Learning rate adjustment",
"Exploration vs exploitation dilemma",
"Model overfitting"
],
"correctAnswerIndex": 2,
"explanation": "ε-greedy gives balance between trying new actions and exploiting known good ones."
},
{
"id": 61,
"questionText": "A policy that adapts and improves as learning progresses is called:",
"options": [
"Random policy",
"Static policy",
"Adaptive policy",
"Greedy policy"
],
"correctAnswerIndex": 2,
"explanation": "Adaptive policy updates itself over time for better performance."
},
{
"id": 62,
"questionText": "If ε is too small in ε-greedy policy:",
"options": [
"It stops acting on states",
"Policy becomes unstable",
"Agent keeps taking random actions",
"The agent may not explore enough"
],
"correctAnswerIndex": 3,
"explanation": "Too small ε leads to very less exploration — causing local optimum trap."
},
{
"id": 63,
"questionText": "Which policy guarantees the best long-term reward theoretically?",
"options": [
"Static policy",
"Random policy",
"Greedy policy",
"Optimal policy"
],
"correctAnswerIndex": 3,
"explanation": "The optimal policy is the mathematically best-performing policy."
},
{
"id": 64,
"questionText": "Policy π is said to converge when:",
"options": [
"Environment resets",
"Rewards become zero",
"Further updates do not improve its long-term return",
"It stops selecting actions"
],
"correctAnswerIndex": 2,
"explanation": "Convergence means policy has reached optimality and no longer changes significantly."
},
{
"id": 65,
"questionText": "Which policy is best when actions are continuous, like turning a steering wheel?",
"options": [
"Static policy",
"Greedy-only policy",
"Stochastic policy",
"Greedy policy"
],
"correctAnswerIndex": 2,
"explanation": "Stochastic policies can represent probability distributions over continuous action values."
},
{
"id": 66,
"questionText": "Which policy type is commonly used in deep policy gradient methods?",
"options": [
"Parameterized stochastic policy",
"Static deterministic policy",
"Greedy-only policy",
"Pure random policy"
],
"correctAnswerIndex": 0,
"explanation": "Policy gradient RL learns continuous parameterized stochastic policies."
},
{
"id": 67,
"questionText": "What is the goal of an optimal policy π*?",
"options": [
"To maximize the expected cumulative future reward",
"To ensure equal action probability",
"To ignore future outcomes",
"To minimize state visitation"
],
"correctAnswerIndex": 0,
"explanation": "Optimal policy focuses only on maximizing long-term discounted returns."
},
{
"id": 68,
"questionText": "Which policy ensures highest theoretical performance but may be hard to compute?",
"options": [
"Optimal policy π*",
"Static policy",
"Fixed greedy policy",
"Random policy"
],
"correctAnswerIndex": 0,
"explanation": "Optimal policy π* is mathematically best but hard to derive in complex environments."
},
{
"id": 69,
"questionText": "When does ε decay in ε-greedy policy?",
"options": [
"At the start of training only",
"As training progresses to reduce randomness",
"Only when reward is negative",
"When the environment resets"
],
"correctAnswerIndex": 1,
"explanation": "Exploration reduces over time by decaying ε — leading to more exploitation later."
},
{
"id": 70,
"questionText": "Which policy is most suitable for a fully trained agent ready for deployment?",
"options": [
"Random policy",
"High ε policy",
"Deterministic or greedy policy",
"Static equal probability policy"
],
"correctAnswerIndex": 2,
"explanation": "Once fully trained, deterministic greedy policies are ideal for stable deployment."
},
{
"id": 71,
"questionText": "In a self-driving car RL system, a 'policy' is best described as:",
"options": [
"A learned probability distribution over actions given states.",
"A list of manually coded safety instructions.",
"A fixed rule for mapping visual input to steering angle.",
"A memory of all past successful trajectories."
],
"correctAnswerIndex": 0,
"explanation": "A policy defines the agent’s behavior as a probability distribution of choosing actions conditioned on the current state."
},
{
"id": 72,
"questionText": "Why are stochastic policies often preferred over deterministic ones in partially observable environments?",
"options": [
"They guarantee immediate reward.",
"They prevent overfitting to rewards.",
"They reduce computation time.",
"They improve exploration and robustness to uncertainty."
],
"correctAnswerIndex": 3,
"explanation": "Stochastic policies help deal with uncertainty and enable natural exploration in partially observable settings."
},
{
"id": 73,
"questionText": "A policy π(a|s) outputs 0.9 for action A and 0.1 for action B. What does this imply?",
"options": [
"Action A is always selected.",
"Both actions have equal probability.",
"Action B is always selected.",
"Action A has higher likelihood of being chosen."
],
"correctAnswerIndex": 3,
"explanation": "The policy indicates the relative preference for taking actions based on their probabilities."
},
{
"id": 74,
"questionText": "In policy-based RL algorithms, what is optimized directly?",
"options": [
"The policy parameters.",
"The value function.",
"The reward function.",
"The transition probability of environment states."
],
"correctAnswerIndex": 0,
"explanation": "Policy-based methods optimize the policy parameters directly to maximize expected reward."
},
{
"id": 75,
"questionText": "Which statement is TRUE about greedy policies?",
"options": [
"They always pick the action with highest estimated value.",
"They ignore Q-values completely.",
"They randomly choose between top two actions.",
"They always balance exploration and exploitation."
],
"correctAnswerIndex": 0,
"explanation": "A greedy policy always chooses the action with maximum current estimated value."
},
{
"id": 76,
"questionText": "A trading bot based on RL keeps choosing the same profitable action and misses future better opportunities. Which failure is happening?",
"options": [
"Exploration overflow",
"Reward hacking",
"State corruption",
"Policy collapse due to over-exploitation"
],
"correctAnswerIndex": 3,
"explanation": "Over-exploitation leads to policy collapse where the agent gets stuck in a suboptimal repeated action."
},
{
"id": 77,
"questionText": "In a policy π(a|s), what does 'temperature' parameter control in Softmax-based action selection?",
"options": [
"Memory retention rate",
"Discount factor",
"Learning rate of the critic",
"Exploration randomness"
],
"correctAnswerIndex": 3,
"explanation": "Temperature controls randomness — higher temperature means more exploration."
},
{
"id": 78,
"questionText": "Why is a policy gradient method preferred in continuous-action environments like robotic arm movement?",
"options": [
"It avoids function approximation.",
"It doesn't need neural networks.",
"It directly outputs continuous actions.",
"It requires no reward function."
],
"correctAnswerIndex": 2,
"explanation": "Policy gradient methods can directly learn probability over continuous action spaces."
},
{
"id": 79,
"questionText": "A policy outputs nearly equal probabilities for multiple actions at late-stage learning. What does this suggest?",
"options": [
"Perfect optimal policy",
"Overfitting to previous states",
"High confidence in one action",
"Underfitting or failure to converge"
],
"correctAnswerIndex": 3,
"explanation": "Equal probabilities late in training indicates the agent has not converged or mastered action ranking."
},
{
"id": 80,
"questionText": "Which is a danger of purely deterministic policies when facing adversarial agents?",
"options": [
"Faster convergence",
"Guaranteed optimal performance",
"Improved reward stability",
"Predictability and easy exploitation"
],
"correctAnswerIndex": 3,
"explanation": "Deterministic policies can be exploited by adversarial agents due to full predictability."
},
{
"id": 81,
"questionText": "In a warehouse robotic RL system, what does a well-designed policy ensure during real-time operation?",
"options": [
"Maximum randomness in each movement",
"Only reward-maximizing actions without safety checks",
"Fixed pre-programmed path regardless of obstacles",
"Consistent action-output given state while adapting to dynamics"
],
"correctAnswerIndex": 3,
"explanation": "A real-world policy must be consistent yet adaptive — fully deterministic is dangerous, fully random is useless."
},
{
"id": 82,
"questionText": "A policy becomes overconfident in one action, ignoring safer alternatives. This is known as:",
"options": [
"Reward normalization failure",
"Policy entropy collapse",
"Temporal instability",
"Exploration dilation"
],
"correctAnswerIndex": 1,
"explanation": "Low entropy implies the policy is stuck in one decision, risking catastrophic exploitation."
},
{
"id": 83,
"questionText": "Why do autonomous drone RL policies avoid deterministic-only action selection?",
"options": [
"It violates Bellman Optimality equation",
"Determinism increases predictability against wind/adversaries",
"Neural networks cannot output deterministic values",
"Computational cost becomes infinite"
],
"correctAnswerIndex": 1,
"explanation": "Adversarial weather or agents can exploit predictable actions — randomness protects against that."
},
{
"id": 84,
"questionText": "A policy π outputs low probability for rare emergency actions. What is the RISK?",
"options": [
"Higher computational cost",
"Over-generalization",
"Slower training convergence",
"Fatal response delay in critical states"
],
"correctAnswerIndex": 3,
"explanation": "Suppressing rare but critical safety actions can cause fatal real-time failures."
},
{
"id": 85,
"questionText": "What does entropy regularization achieve in modern policy optimization?",
"options": [
"Sets constant reward for all actions",
"Forces agent to become deterministic",
"Removes exploration entirely",
"Discourages policy from overfitting to high-reward actions early"
],
"correctAnswerIndex": 3,
"explanation": "Entropy regularization encourages exploration and prevents collapsing into premature deterministic behavior."
},
{
"id": 86,
"questionText": "In large-scale financial RL trading, too stochastic a policy may cause:",
"options": [
"Zero exploration",
"Excessive random trading losses",
"Complete policy collapse",
"High overfitting to past trades"
],
"correctAnswerIndex": 1,
"explanation": "Too much randomness causes excessive blind risk-taking in finance — balanced policy entropy is key."
},
{
"id": 87,
"questionText": "Policy π(a|s) shifts radically between actions on each step even in identical states. This indicates:",
"options": [
"Optimal deterministic convergence",
"Exploration success",
"Severe policy instability or oscillation",
"Healthy dynamic learning"
],
"correctAnswerIndex": 2,
"explanation": "Wildly changing action selection signals unstable or oscillating policy — unsafe in real-world RL."
},
{
"id": 88,
"questionText": "An RL policy in medical treatment planning must prioritize which property MOST?",
"options": [
"Maximum exploration",
"Random therapy attempts",
"Full reward-maximization regardless of side-effects",
"Predictable and safe deterministic bias with edge-case backup"
],
"correctAnswerIndex": 3,
"explanation": "Medical RL systems must prioritize safety + controlled determinism with controlled fallback."
},
{
"id": 89,
"questionText": "Why do adversarial training environments use stochastic policies intentionally?",
"options": [
"To make agent behavior unpredictable and robust",
"To remove neural network computations",
"To reduce model size",
"To lock policy into single fixed action"
],
"correctAnswerIndex": 0,
"explanation": "Stochasticity reduces exploitability — critical in adversarial environments."
},
{
"id": 90,
"questionText": "A policy is said to be 'generalizable' when it:",
"options": [
"Always explores instead of exploiting",
"Only performs well on its training scenarios",
"Relies on manual rule-based expert tuning",
"Maintains stable performance across unseen states"
],
"correctAnswerIndex": 3,
"explanation": "Generalizable policies can adapt and remain effective even in unseen or shifted state environments."
},
{
"id": 91,
"questionText": "A reinforcement learning agent in an autonomous drone begins prioritizing fuel-saving over obstacle avoidance. What failure is occurring?",
"options": [
"Network underfitting due to low capacity",
"Reward misalignment leading to unsafe policy behavior",
"Over-exploration of random states",
"Incorrect Q-value normalization"
],
"correctAnswerIndex": 1,
"explanation": "Reward misalignment leads the agent to learn dangerous shortcuts that violate safety constraints."
},
{
"id": 92,
"questionText": "Which policy behavior is IDEAL for high-speed stock trading RL agents?",
"options": [
"Completely deterministic for maximum consistency",
"Policy that ignores market shifts",
"Pure random policy",
"Stochastic with controlled risk-awareness and adaptive confidence"
],
"correctAnswerIndex": 3,
"explanation": "A hybrid of deterministic stability and stochastic adaptability is ideal in rapidly shifting markets."
},
{
"id": 93,
"questionText": "In human-in-the-loop medical RL systems, which policy issue is most unethical?",
"options": [
"Occasionally being conservative",
"Learning slower than expected",
"Taking irreversible risky action without confidence or override",
"Being slightly inefficient"
],
"correctAnswerIndex": 2,
"explanation": "Irreversible high-risk actions without confidence or override are ethically forbidden in medical RL."
},
{
"id": 94,
"questionText": "A robot vacuum cleaner learns a policy that avoids cleaning hard-to-reach corners to minimize energy use. What is happening?",
"options": [
"Policy generalization improvement",
"Reward hacking through exploitation loopholes",
"Correct exploration behavior",
"Perfect optimal cleaning policy"
],
"correctAnswerIndex": 1,
"explanation": "Reward hacking — agent optimizes metric while ignoring true task objectives (corner cleaning)."
},
{
"id": 95,
"questionText": "Why is a soft policy update (e.g., Polyak averaging) preferred over a hard update?",
"options": [
"It makes the policy fully deterministic",
"Reduces model size",
"Increases reward immediately",
"Prevents destabilizing sudden policy jumps"
],
"correctAnswerIndex": 3,
"explanation": "Soft updates stabilize learning by gradually blending new policies into existing ones."
},
{
"id": 96,
"questionText": "Which real-world failure BEST explains why RL policies must include fallback or rejection mechanisms?",
"options": [
"Agent faces unseen scenario and takes catastrophic action",
"Reward is slightly lower than expected",
"Agent reaches optimal convergence",
"Robot repeats successful action"
],
"correctAnswerIndex": 0,
"explanation": "Unseen scenario + overconfident wrong action = critical failure → fallback mechanisms are mandatory."
},
{
"id": 97,
"questionText": "A self-driving car RL policy begins intentionally taking risky shortcuts to win racing simulations. What is this problem called?",
"options": [
"Reward hacking / specification gaming",
"State abstraction conflict",
"Policy lag",
"Exploration decay failure"
],
"correctAnswerIndex": 0,
"explanation": "Reward hacking occurs when agent finds loopholes in reward design — optimizing metric but violating goal intentions."
},
{
"id": 98,
"questionText": "A policy must prioritize __________ ABOVE raw reward maximization in life-critical RL applications.",
"options": [
"Hard-coded deterministic action",
"Fastest reward achievement",
"Exploration rate tuning",
"Safety and long-term stability"
],
"correctAnswerIndex": 3,
"explanation": "In life-critical environments, safety & reliability are priority over short-term reward spikes."
},
{
"id": 99,
"questionText": "What is the key reason modern RL policy models use entropy terms during early training?",
"options": [
"Reduce network size",
"Encourage broader exploration to avoid premature policy locking",
"Guarantee immediate reward",
"Force complete determinism"
],
"correctAnswerIndex": 1,
"explanation": "Entropy prevents collapse into suboptimal deterministic policy too early during training."
},
{
"id": 100,
"questionText": "In a policy deployment for military RL drones, what is considered NON-NEGOTIABLE?",
"options": [
"Maximum exploration for new tactics",
"Full removal of human supervision",
"Absolute controllability and override priority over agent autonomy",
"Allowing autonomous lethal decisions"
],
"correctAnswerIndex": 2,
"explanation": "Override authority and controllability are mandatory — autonomy is allowed but never absolute in critical RL systems."
}
]
}