{
"title": "Deep Reinforcement Learning Mastery: 100 MCQs",
"description": "A comprehensive set of 100 multiple-choice questions to test and deepen your understanding of Deep Reinforcement Learning, from basic concepts to advanced topics like Deep Q-Networks, Policy Gradients, and Actor-Critic methods.",
"questions": [
{
"id": 1,
"questionText": "What is the main difference between standard Q-Learning and Deep Q-Learning?",
"options": [
"Q-Learning ignores rewards",
"Deep Q-Learning uses a neural network to approximate Q-values",
"Q-Learning uses continuous actions",
"Deep Q-Learning requires supervised labels"
],
"correctAnswerIndex": 1,
"explanation": "Deep Q-Learning replaces the Q-table with a neural network to handle large or continuous state spaces."
},
{
"id": 2,
"questionText": "Which problem does function approximation in Deep Reinforcement Learning solve?",
"options": [
"Exploration vs. exploitation",
"Handling large or continuous state spaces",
"Reward shaping",
"Reducing discount factor"
],
"correctAnswerIndex": 1,
"explanation": "Function approximation allows the agent to generalize Q-values across many states instead of storing a table."
},
{
"id": 3,
"questionText": "In Deep Q-Networks (DQN), what is the purpose of the target network?",
"options": [
"Generate random actions",
"Provide rewards",
"Stabilize learning by providing fixed Q-value targets periodically",
"Replace policy network completely"
],
"correctAnswerIndex": 2,
"explanation": "The target network reduces oscillations by keeping Q-value targets fixed for several updates before copying from the main network."
},
{
"id": 4,
"questionText": "What is experience replay in DRL?",
"options": [
"Resetting the environment",
"Storing past experiences and sampling randomly for training",
"Recording rewards only",
"Using supervised labels"
],
"correctAnswerIndex": 1,
"explanation": "Experience replay breaks correlation between sequential data and improves learning stability by training on randomly sampled past experiences."
},
{
"id": 5,
"questionText": "Which activation function is commonly used in DRL networks?",
"options": [
"Linear only",
"ReLU",
"Tanh only",
"Sigmoid only"
],
"correctAnswerIndex": 1,
"explanation": "ReLU is commonly used due to its efficiency and ability to reduce vanishing gradient problems."
},
{
"id": 6,
"questionText": "In DRL, what is the role of the discount factor γ?",
"options": [
"Control neural network learning rate",
"Weigh future rewards relative to immediate rewards",
"Scale input features",
"Select actions randomly"
],
"correctAnswerIndex": 1,
"explanation": "The discount factor balances the importance of immediate versus future rewards in value estimation."
},
{
"id": 7,
"questionText": "Why is Q-learning considered off-policy?",
"options": [
"It uses supervised labels",
"It updates values using the best next action, not necessarily the action taken",
"It follows a fixed policy only",
"It ignores rewards"
],
"correctAnswerIndex": 1,
"explanation": "Off-policy learning uses the optimal action for updating Q-values regardless of the agent’s actual behavior policy."
},
{
"id": 8,
"questionText": "Which problem does target network in DQN help to mitigate?",
"options": [
"Exploration failure",
"Instability due to moving Q-value targets",
"Sparse rewards",
"Reward hacking"
],
"correctAnswerIndex": 1,
"explanation": "Fixing Q-value targets for several steps reduces oscillations and divergence in neural network training."
},
{
"id": 9,
"questionText": "In DRL, why is experience replay preferred over online updates?",
"options": [
"Requires supervised data",
"Reduces correlation between consecutive samples",
"Only works with deterministic environments",
"Avoids using discount factor"
],
"correctAnswerIndex": 1,
"explanation": "Sampling random experiences helps the network learn more effectively from diverse states and actions."
},
{
"id": 10,
"questionText": "What is a primary advantage of Deep Q-Networks over tabular Q-Learning?",
"options": [
"Eliminates exploration",
"Removes the need for discount factor",
"Can handle high-dimensional or continuous state spaces",
"Requires fewer rewards"
],
"correctAnswerIndex": 2,
"explanation": "DQN can generalize across large state spaces using neural networks instead of storing Q-values in a table."
},
{
"id": 11,
"questionText": "Which of the following is a common DRL benchmark environment?",
"options": [
"ImageNet",
"MNIST",
"OpenAI Gym",
"COCO"
],
"correctAnswerIndex": 2,
"explanation": "OpenAI Gym provides standardized environments for testing DRL algorithms."
},
{
"id": 12,
"questionText": "Why is gradient clipping used in DRL?",
"options": [
"Reduce discount factor",
"Increase rewards artificially",
"Control exploration rate",
"Prevent exploding gradients during neural network training"
],
"correctAnswerIndex": 3,
"explanation": "Gradient clipping limits extreme weight updates, improving stability."
},
{
"id": 13,
"questionText": "Which is true about the replay buffer size in DQN?",
"options": [
"Size does not affect learning",
"Larger buffers store more diverse experiences but use more memory",
"Buffers store only rewards",
"Small buffers always converge faster"
],
"correctAnswerIndex": 1,
"explanation": "Larger buffers provide better sample diversity, improving learning stability."
},
{
"id": 14,
"questionText": "In DRL, what is the main challenge with continuous action spaces?",
"options": [
"Exploration is unnecessary",
"Rewards cannot be used",
"Q-Learning requires discrete actions; approximation methods like DDPG are needed",
"Discount factor cannot be applied"
],
"correctAnswerIndex": 2,
"explanation": "Discrete action Q-learning cannot directly handle continuous actions; actor-critic methods or policy gradients are used."
},
{
"id": 15,
"questionText": "What is the purpose of a policy network in policy gradient methods?",
"options": [
"Estimates Q-values",
"Directly outputs action probabilities for a given state",
"Stores experiences",
"Generates rewards"
],
"correctAnswerIndex": 1,
"explanation": "Policy networks map states to action probabilities, allowing stochastic or deterministic policies."
},
{
"id": 16,
"questionText": "Which DRL method is considered on-policy?",
"options": [
"Double Q-Learning",
"Experience Replay only",
"Deep Q-Network (DQN)",
"REINFORCE"
],
"correctAnswerIndex": 3,
"explanation": "REINFORCE updates the policy based on actions actually taken by the agent, making it on-policy."
},
{
"id": 17,
"questionText": "Which type of neural network is commonly used in DRL for image inputs?",
"options": [
"Recurrent Neural Networks (RNNs) only",
"Fully connected only",
"Autoencoders only",
"Convolutional Neural Networks (CNNs)"
],
"correctAnswerIndex": 3,
"explanation": "CNNs extract spatial features from images for DRL agents like in Atari games."
},
{
"id": 18,
"questionText": "In DRL, what is a major issue with high variance in policy gradient methods?",
"options": [
"Q-values are ignored",
"Exploration becomes deterministic",
"Slow and unstable learning",
"Rewards become negative"
],
"correctAnswerIndex": 2,
"explanation": "High variance can make gradient updates noisy, slowing convergence."
},
{
"id": 19,
"questionText": "Which technique reduces variance in policy gradient updates?",
"options": [
"Use a baseline or advantage function",
"Use greedy policy",
"Increase discount factor",
"Ignore rewards"
],
"correctAnswerIndex": 0,
"explanation": "Subtracting a baseline (like state value) reduces variance while keeping the estimate unbiased."
},
{
"id": 20,
"questionText": "What is the main advantage of Actor-Critic methods over DQN?",
"options": [
"Eliminates exploration",
"Requires tabular Q-table",
"Can handle continuous actions and reduce variance with a value estimator",
"Removes the need for rewards"
],
"correctAnswerIndex": 2,
"explanation": "Actor-Critic combines policy learning (actor) with value estimation (critic) for better performance, especially in continuous domains."
},
{
"id": 21,
"questionText": "Which DRL algorithm is suitable for continuous action control?",
"options": [
"DQN",
"SARSA",
"Deep Deterministic Policy Gradient (DDPG)",
"Monte Carlo only"
],
"correctAnswerIndex": 2,
"explanation": "DDPG can output continuous actions using an actor network and learn value using a critic network."
},
{
"id": 22,
"questionText": "Why are target networks important in DRL?",
"options": [
"Store experience replay",
"Control exploration",
"Generate rewards",
"Stabilize training by reducing oscillations in Q-value targets"
],
"correctAnswerIndex": 3,
"explanation": "Without a target network, the moving Q-value targets cause instability during neural network updates."
},
{
"id": 23,
"questionText": "What is a common solution for partially observable environments in DRL?",
"options": [
"Use Recurrent Neural Networks (RNNs) to remember past states",
"Use tabular Q-Learning",
"Increase discount factor",
"Ignore history"
],
"correctAnswerIndex": 0,
"explanation": "RNNs allow the agent to maintain an internal state, improving decisions in partially observable settings."
},
{
"id": 24,
"questionText": "Which method combines policy gradients and value estimation for stability?",
"options": [
"Monte Carlo only",
"SARSA only",
"Advantage Actor-Critic (A2C)",
"DQN only"
],
"correctAnswerIndex": 2,
"explanation": "A2C uses a critic to estimate value and an actor to update policy, reducing variance and improving learning."
},
{
"id": 25,
"questionText": "In DRL, what is the purpose of epsilon-greedy policy?",
"options": [
"Balance exploration and exploitation",
"Reduce network size",
"Ignore rewards",
"Stabilize gradients"
],
"correctAnswerIndex": 0,
"explanation": "Epsilon-greedy chooses random actions with probability ε to explore the environment while mostly exploiting the best-known action."
},
{
"id": 26,
"questionText": "Which optimization algorithm is commonly used to train DRL networks?",
"options": [
"SGD only",
"Adam",
"None",
"RMSProp only"
],
"correctAnswerIndex": 1,
"explanation": "Adam combines momentum and adaptive learning rates, making it effective for DRL training."
},
{
"id": 27,
"questionText": "What is reward shaping in DRL?",
"options": [
"Changing discount factor",
"Modifying the reward signal to provide intermediate feedback",
"Removing rewards",
"Randomizing actions"
],
"correctAnswerIndex": 1,
"explanation": "Reward shaping provides more frequent feedback to accelerate learning while keeping the optimal policy unchanged."
},
{
"id": 28,
"questionText": "Why is clipping rewards sometimes used in DRL?",
"options": [
"Increase exploration",
"Reduce network size",
"Prevent large gradients and stabilize training",
"Ignore rewards"
],
"correctAnswerIndex": 2,
"explanation": "Clipping avoids extremely large reward signals that can destabilize learning."
},
{
"id": 29,
"questionText": "Which DRL method directly models a stochastic policy?",
"options": [
"Policy Gradient (REINFORCE)",
"DQN",
"SARSA",
"Double Q-Learning"
],
"correctAnswerIndex": 0,
"explanation": "Policy gradients learn a probability distribution over actions, allowing stochastic action selection."
},
{
"id": 30,
"questionText": "In DRL, why is it important to normalize inputs?",
"options": [
"Q-values become zero",
"Rewards are ignored",
"Exploration is unnecessary",
"Neural network training is more stable and faster"
],
"correctAnswerIndex": 3,
"explanation": "Normalized inputs prevent large-scale differences that can hinder learning and slow convergence."
},
{
"id": 31,
"questionText": "What is the key idea behind Double DQN?",
"options": [
"Use two policies to explore the environment",
"Combine policy gradient with Q-learning",
"Update Q-values twice per step",
"Use one network for action selection and another for evaluation to reduce overestimation"
],
"correctAnswerIndex": 3,
"explanation": "Double DQN separates action selection and Q-value evaluation to mitigate overestimation bias seen in standard DQN."
},
{
"id": 32,
"questionText": "What is the purpose of prioritized experience replay?",
"options": [
"Sample important experiences more frequently to improve learning efficiency",
"Ignore old experiences",
"Store only positive rewards",
"Replay experiences in sequential order"
],
"correctAnswerIndex": 0,
"explanation": "Prioritized replay focuses learning on transitions with higher temporal-difference errors, improving convergence speed."
},
{
"id": 33,
"questionText": "Which DRL algorithm is suitable for continuous control tasks?",
"options": [
"DQN",
"Deep Deterministic Policy Gradient (DDPG)",
"Q-Learning",
"SARSA"
],
"correctAnswerIndex": 1,
"explanation": "DDPG can handle continuous action spaces using an actor network to output continuous actions and a critic network to estimate values."
},
{
"id": 34,
"questionText": "In Actor-Critic methods, what is the role of the critic?",
"options": [
"Modify rewards",
"Estimate the value function to guide the actor",
"Store replay memory",
"Select actions randomly"
],
"correctAnswerIndex": 1,
"explanation": "The critic evaluates the current policy by estimating the expected return, providing feedback to the actor for policy improvement."
},
{
"id": 35,
"questionText": "Which advantage does A3C (Asynchronous Advantage Actor-Critic) provide over standard Actor-Critic?",
"options": [
"Removes the need for value estimation",
"Eliminates exploration",
"Uses only one agent to reduce computation",
"Parallel training with multiple agents to stabilize learning"
],
"correctAnswerIndex": 3,
"explanation": "A3C uses multiple asynchronous agents exploring in parallel, which stabilizes learning and improves convergence speed."
},
{
"id": 36,
"questionText": "What is the main challenge of high-dimensional state spaces in DRL?",
"options": [
"Rewards become negative",
"Learning rate α becomes zero",
"Discount factor is ignored",
"Curse of dimensionality increases sample complexity"
],
"correctAnswerIndex": 3,
"explanation": "High-dimensional inputs require more data to learn effective policies and can slow convergence."
},
{
"id": 37,
"questionText": "Which technique helps DRL agents learn from visual input effectively?",
"options": [
"RNNs only",
"Convolutional Neural Networks (CNNs)",
"Decision trees",
"Fully connected networks only"
],
"correctAnswerIndex": 1,
"explanation": "CNNs extract spatial features from images, enabling DRL agents to handle complex visual environments."
},
{
"id": 38,
"questionText": "In DDPG, why is it necessary to add noise to actions during training?",
"options": [
"Promote exploration in continuous action spaces",
"Stabilize the target network",
"Reduce rewards",
"Increase discount factor"
],
"correctAnswerIndex": 0,
"explanation": "Exploration is crucial in continuous action spaces; adding noise ensures the agent explores various actions."
},
{
"id": 39,
"questionText": "What is the purpose of advantage function in A2C or A3C?",
"options": [
"Reduce variance in policy gradient updates",
"Ignore state values",
"Store experiences",
"Increase rewards"
],
"correctAnswerIndex": 0,
"explanation": "Advantage function measures how much better an action is compared to the expected value, reducing variance in updates."
},
{
"id": 40,
"questionText": "Which of the following is a major limitation of vanilla policy gradients?",
"options": [
"Cannot handle discrete actions",
"Requires tabular Q-table",
"High variance in gradient estimates",
"Ignores rewards"
],
"correctAnswerIndex": 2,
"explanation": "Vanilla policy gradients have high variance, making learning slow and unstable."
},
{
"id": 41,
"questionText": "Why is normalization of input features important in DRL?",
"options": [
"Increases rewards artificially",
"Stabilizes neural network training and improves convergence",
"Reduces exploration",
"Removes discount factor"
],
"correctAnswerIndex": 1,
"explanation": "Normalization prevents large-scale differences that could destabilize learning and slow down convergence."
},
{
"id": 42,
"questionText": "In DRL, what is the role of target smoothing in DDPG?",
"options": [
"Prevent oscillations by slowly updating target networks",
"Ignore discount factor",
"Randomize actions",
"Generate rewards"
],
"correctAnswerIndex": 0,
"explanation": "Soft updates of the target network improve training stability by avoiding large sudden changes in Q-values."
},
{
"id": 43,
"questionText": "What does the term 'on-policy' mean in DRL?",
"options": [
"Agent stores experiences only",
"Agent ignores rewards",
"Agent uses a separate policy for evaluation",
"Agent updates policy using actions it actually takes"
],
"correctAnswerIndex": 3,
"explanation": "On-policy methods learn the value of the policy being executed, unlike off-policy methods which can learn from other policies."
},
{
"id": 44,
"questionText": "What does 'off-policy' learning in DRL allow?",
"options": [
"Ignoring rewards",
"Reducing discount factor to zero",
"Only learning from current policy",
"Learning optimal policy using experiences from a different behavior policy"
],
"correctAnswerIndex": 3,
"explanation": "Off-policy learning allows using past experiences or exploratory actions to learn the optimal policy."
},
{
"id": 45,
"questionText": "Which method is used to reduce correlation between consecutive samples in DRL?",
"options": [
"Target networks only",
"Policy gradient",
"Greedy policy",
"Experience replay"
],
"correctAnswerIndex": 3,
"explanation": "Experience replay randomly samples past experiences, breaking temporal correlations and improving learning stability."
},
{
"id": 46,
"questionText": "Which DRL algorithm is suitable for environments with discrete action spaces?",
"options": [
"DDPG",
"Policy Gradient with continuous actor",
"SARSA only",
"Deep Q-Network (DQN)"
],
"correctAnswerIndex": 3,
"explanation": "DQN works well in discrete action spaces by estimating Q-values for all possible actions."
},
{
"id": 47,
"questionText": "Why is reward clipping sometimes applied in DRL?",
"options": [
"Normalize inputs",
"Prevent very large rewards from destabilizing training",
"Increase exploration",
"Reduce discount factor"
],
"correctAnswerIndex": 1,
"explanation": "Clipping rewards prevents extreme updates in the network that could destabilize learning."
},
{
"id": 48,
"questionText": "What is the main advantage of using Actor-Critic over pure policy gradients?",
"options": [
"Removes discount factor",
"No neural network required",
"Eliminates need for exploration",
"Reduced variance and better sample efficiency"
],
"correctAnswerIndex": 3,
"explanation": "The critic estimates value function to guide the actor, reducing variance compared to vanilla policy gradient."
},
{
"id": 49,
"questionText": "In DRL, why is gradient clipping applied?",
"options": [
"Increase discount factor",
"Prevent exploding gradients and stabilize learning",
"Store experiences",
"Normalize inputs"
],
"correctAnswerIndex": 1,
"explanation": "Clipping gradient magnitudes ensures neural network weights do not change abruptly, preventing instability."
},
{
"id": 50,
"questionText": "Which DRL algorithm can handle both discrete and continuous action spaces with separate actor and critic networks?",
"options": [
"SARSA only",
"Actor-Critic / DDPG",
"REINFORCE only",
"DQN only"
],
"correctAnswerIndex": 1,
"explanation": "Actor-Critic methods and DDPG separate policy and value networks, allowing application in both discrete and continuous domains."
},
{
"id": 51,
"questionText": "What is the main idea behind Advantage Actor-Critic (A2C)?",
"options": [
"Store experiences for replay",
"Ignore policy updates",
"Only use the critic for evaluation",
"Use the advantage function to reduce variance in policy updates"
],
"correctAnswerIndex": 3,
"explanation": "Advantage function improves learning stability by comparing action value against expected value for the state."
},
{
"id": 52,
"questionText": "In DRL, what is a major issue with partial observability?",
"options": [
"Discount factor cannot be applied",
"Q-values are ignored",
"Agent does not have full knowledge of the environment, making decision-making harder",
"Rewards become deterministic"
],
"correctAnswerIndex": 2,
"explanation": "Partial observability requires the agent to infer hidden state information, often handled with RNNs."
},
{
"id": 53,
"questionText": "Which technique improves exploration in continuous action DRL algorithms?",
"options": [
"Clipping rewards",
"Use deterministic greedy policy only",
"Add noise (e.g., Ornstein-Uhlenbeck process in DDPG) to actor outputs",
"Reducing discount factor"
],
"correctAnswerIndex": 2,
"explanation": "Adding noise ensures the agent explores diverse actions in continuous spaces."
},
{
"id": 54,
"questionText": "What is the purpose of soft updates in target networks?",
"options": [
"Increase exploration",
"Smoothly update target network parameters to improve stability",
"Ignore experience replay",
"Clip rewards"
],
"correctAnswerIndex": 1,
"explanation": "Soft updates prevent large jumps in Q-value targets, stabilizing training."
},
{
"id": 55,
"questionText": "Which DRL algorithm is particularly suitable for large discrete action spaces?",
"options": [
"DDPG",
"Dueling DQN",
"SARSA only",
"Policy Gradient only"
],
"correctAnswerIndex": 1,
"explanation": "Dueling DQN separates state-value and advantage function, allowing efficient learning in large discrete action spaces."
},
{
"id": 56,
"questionText": "Why is advantage function useful in policy gradient methods?",
"options": [
"Removes need for rewards",
"Eliminates discount factor",
"Reduces variance without introducing bias",
"Stores experiences"
],
"correctAnswerIndex": 2,
"explanation": "By comparing action value to baseline, variance in gradient estimates decreases, improving stability."
},
{
"id": 57,
"questionText": "In DRL, what is entropy regularization?",
"options": [
"Reduce rewards",
"Encourage exploration by adding entropy of the policy to the loss function",
"Clips gradients",
"Store experience replay"
],
"correctAnswerIndex": 1,
"explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration."
},
{
"id": 58,
"questionText": "Which neural network is used to handle sequences in partially observable DRL tasks?",
"options": [
"Decision trees",
"Fully connected networks",
"Recurrent Neural Networks (RNNs)",
"CNNs only"
],
"correctAnswerIndex": 2,
"explanation": "RNNs maintain hidden states over time, allowing the agent to infer information from past observations."
},
{
"id": 59,
"questionText": "Why is target network in DQN updated periodically?",
"options": [
"Clip gradients",
"Reduce oscillations and stabilize learning",
"Increase rewards",
"Reduce exploration"
],
"correctAnswerIndex": 1,
"explanation": "Periodic updates provide fixed targets for several steps, preventing divergence."
},
{
"id": 60,
"questionText": "What is the main difference between DDPG and DQN?",
"options": [
"DQN uses actor-critic; DDPG does not",
"DDPG handles continuous actions; DQN handles discrete actions",
"DQN is on-policy",
"DDPG requires tabular Q-table"
],
"correctAnswerIndex": 1,
"explanation": "DDPG uses actor-critic for continuous actions, while DQN uses Q-value approximations for discrete actions."
},
{
"id": 61,
"questionText": "What is the role of the critic in Actor-Critic methods?",
"options": [
"Estimate value function to evaluate actions",
"Normalize inputs",
"Select random actions",
"Clip rewards"
],
"correctAnswerIndex": 0,
"explanation": "The critic evaluates the policy by providing feedback on the quality of actions, guiding the actor."
},
{
"id": 62,
"questionText": "Which DRL method is designed for multi-agent asynchronous training?",
"options": [
"DQN",
"SARSA",
"DDPG",
"A3C"
],
"correctAnswerIndex": 3,
"explanation": "A3C uses multiple agents training in parallel, improving efficiency and stability."
},
{
"id": 63,
"questionText": "Which approach addresses overestimation in Q-values in DRL?",
"options": [
"Double DQN",
"Actor-Critic",
"Policy gradient",
"DQN only"
],
"correctAnswerIndex": 0,
"explanation": "Double DQN separates selection and evaluation, reducing overestimation bias in Q-learning."
},
{
"id": 64,
"questionText": "Which DRL algorithm uses deterministic policy for continuous control?",
"options": [
"DQN",
"A2C",
"DDPG",
"REINFORCE"
],
"correctAnswerIndex": 2,
"explanation": "DDPG outputs deterministic actions from the actor network, suitable for continuous action environments."
},
{
"id": 65,
"questionText": "Why is reward shaping useful in DRL?",
"options": [
"Eliminates exploration",
"Removes discount factor",
"Provides intermediate rewards to accelerate learning",
"Stores experiences"
],
"correctAnswerIndex": 2,
"explanation": "Shaping rewards gives the agent feedback on progress towards goals, improving convergence speed."
},
{
"id": 66,
"questionText": "Which technique reduces variance in policy gradient methods?",
"options": [
"Using advantage function or baseline",
"Increasing learning rate",
"Reducing discount factor",
"Clipping rewards"
],
"correctAnswerIndex": 0,
"explanation": "Subtracting a baseline from the return reduces variance while keeping gradient estimates unbiased."
},
{
"id": 67,
"questionText": "In DRL, why is exploration important?",
"options": [
"Ignore rewards",
"Store experiences",
"Ensure agent discovers optimal actions rather than exploiting suboptimal known actions",
"Reduce discount factor"
],
"correctAnswerIndex": 2,
"explanation": "Exploration allows the agent to learn about the environment and avoid getting stuck in local optima."
},
{
"id": 68,
"questionText": "Which problem does partial observability introduce in DRL?",
"options": [
"Learning rate becomes zero",
"Rewards become deterministic",
"Agent cannot fully observe the environment state, making decision-making harder",
"Discount factor is ignored"
],
"correctAnswerIndex": 2,
"explanation": "Partial observability requires the agent to maintain internal memory or inference to act effectively."
},
{
"id": 69,
"questionText": "Which DRL algorithm is on-policy?",
"options": [
"DQN",
"A2C",
"Double DQN",
"DDPG"
],
"correctAnswerIndex": 1,
"explanation": "A2C updates the policy based on actions actually taken, making it on-policy."
},
{
"id": 70,
"questionText": "Why is entropy regularization used in policy gradient DRL?",
"options": [
"Reduce rewards",
"Encourage exploration by preventing premature convergence to deterministic policies",
"Clip gradients",
"Normalize inputs"
],
"correctAnswerIndex": 1,
"explanation": "Entropy regularization adds a term to the loss to favor higher-entropy (more exploratory) policies."
},
{
"id": 71,
"questionText": "A robot using DDPG in a continuous action space keeps colliding with obstacles. What is the best approach?",
"options": [
"Reduce discount factor to zero",
"Use greedy deterministic policy only",
"Modify the reward function to penalize collisions heavily",
"Ignore collisions and continue training"
],
"correctAnswerIndex": 2,
"explanation": "Reward shaping helps the agent learn safer actions while maintaining exploration."
},
{
"id": 72,
"questionText": "A DRL agent trained with DQN in a stochastic environment overestimates Q-values. What modification can help?",
"options": [
"Use Double DQN to separate action selection and evaluation",
"Use on-policy updates only",
"Ignore rewards",
"Increase learning rate drastically"
],
"correctAnswerIndex": 0,
"explanation": "Double DQN mitigates overestimation by using separate networks for selection and evaluation."
},
{
"id": 73,
"questionText": "During training, a DRL agent’s policy oscillates and does not converge. What is a likely cause?",
"options": [
"High variance in policy gradients or unstable target updates",
"No experience replay used",
"Discount factor too low",
"Low rewards"
],
"correctAnswerIndex": 0,
"explanation": "High variance and unstable updates can cause oscillations; techniques like advantage function or target smoothing help stabilize learning."
},
{
"id": 74,
"questionText": "A multi-agent DRL environment suffers from slow learning. Which approach can improve training efficiency?",
"options": [
"Ignore rewards",
"Reduce network size drastically",
"Use A3C with multiple asynchronous agents",
"Use deterministic greedy policy only"
],
"correctAnswerIndex": 2,
"explanation": "Asynchronous agents explore in parallel, speeding up learning and stabilizing convergence."
},
{
"id": 75,
"questionText": "An agent using policy gradients receives sparse rewards, making learning slow. How can this be mitigated?",
"options": [
"Reduce discount factor",
"Apply reward shaping to provide intermediate feedback",
"Ignore sparse rewards",
"Use deterministic actions only"
],
"correctAnswerIndex": 1,
"explanation": "Reward shaping provides more frequent signals to accelerate learning in sparse-reward environments."
},
{
"id": 76,
"questionText": "During DRL training with continuous actions, exploration is insufficient. What should be done?",
"options": [
"Set discount factor to zero",
"Remove reward signals",
"Add noise (e.g., Ornstein-Uhlenbeck) to actor outputs",
"Use only greedy policy"
],
"correctAnswerIndex": 2,
"explanation": "Adding noise ensures exploration in continuous action spaces, helping the agent discover better policies."
},
{
"id": 77,
"questionText": "A DRL agent trained with DQN is unstable and diverging. Which technique can stabilize training?",
"options": [
"Set discount factor to zero",
"Ignore rewards",
"Use target networks and experience replay",
"Reduce network capacity drastically"
],
"correctAnswerIndex": 2,
"explanation": "Target networks and experience replay break correlations and provide stable Q-value targets, improving convergence."
},
{
"id": 78,
"questionText": "An agent using Actor-Critic has slow convergence due to high gradient variance. What is a solution?",
"options": [
"Remove critic network",
"Ignore rewards",
"Use advantage function or baseline to reduce variance",
"Increase discount factor to 1"
],
"correctAnswerIndex": 2,
"explanation": "Advantage function compares action value to expected state value, reducing variance without biasing updates."
},
{
"id": 79,
"questionText": "A DRL agent trained in partially observable environment fails to act optimally. Which method can help?",
"options": [
"Use Recurrent Neural Networks to maintain internal memory",
"Remove actor network",
"Use DQN only",
"Increase discount factor to 1"
],
"correctAnswerIndex": 0,
"explanation": "RNNs allow the agent to remember past observations, improving decisions under partial observability."
},
{
"id": 80,
"questionText": "During continuous control DRL, Q-values fluctuate wildly. What can help stabilize learning?",
"options": [
"Soft updates of target networks and smaller learning rates",
"Ignore rewards",
"Remove exploration noise",
"Reduce discount factor to zero"
],
"correctAnswerIndex": 0,
"explanation": "Soft target updates and cautious learning rates prevent large oscillations in value estimates."
},
{
"id": 81,
"questionText": "A DRL agent in a robotics task learns slowly due to sparse reward signals. What technique can accelerate learning?",
"options": [
"Reduce learning rate to zero",
"Remove critic network",
"Apply reward shaping with intermediate rewards",
"Increase discount factor to 1.0"
],
"correctAnswerIndex": 2,
"explanation": "Reward shaping provides denser feedback, helping the agent learn meaningful behaviors faster."
},
{
"id": 82,
"questionText": "In a stochastic environment, a DQN agent overestimates some Q-values. Which approach helps?",
"options": [
"Use on-policy updates only",
"Use Double DQN to decouple selection and evaluation",
"Ignore replay buffer",
"Reduce discount factor to zero"
],
"correctAnswerIndex": 1,
"explanation": "Double DQN reduces overestimation bias by separating action selection and Q-value evaluation."
},
{
"id": 83,
"questionText": "An agent using DDPG shows poor exploration. What is the most effective solution?",
"options": [
"Use deterministic greedy policy",
"Reduce discount factor",
"Add temporally correlated noise to the actor actions",
"Ignore reward signals"
],
"correctAnswerIndex": 2,
"explanation": "Temporally correlated noise (e.g., Ornstein-Uhlenbeck) encourages effective exploration in continuous action spaces."
},
{
"id": 84,
"questionText": "During training, a policy gradient agent exhibits high variance. What strategy reduces it?",
"options": [
"Remove reward signals",
"Subtract a baseline or use advantage function",
"Reduce discount factor to zero",
"Increase learning rate drastically"
],
"correctAnswerIndex": 1,
"explanation": "Using a baseline reduces the variance of gradient estimates while maintaining unbiased updates."
},
{
"id": 85,
"questionText": "A partially observable DRL environment prevents the agent from seeing the full state. What is the solution?",
"options": [
"Increase learning rate",
"Use RNNs or LSTMs to retain past observations",
"Remove reward shaping",
"Use DQN only"
],
"correctAnswerIndex": 1,
"explanation": "RNNs or LSTMs provide memory of past states, allowing better decision-making despite partial observability."
},
{
"id": 86,
"questionText": "In multi-agent DRL, agents’ policies interfere with each other, causing instability. Which method can help?",
"options": [
"Reduce discount factor to zero",
"Use independent learning or centralized training with decentralized execution",
"Ignore rewards",
"Remove actor network"
],
"correctAnswerIndex": 1,
"explanation": "Centralized training stabilizes learning by considering other agents’ actions while still allowing decentralized execution."
},
{
"id": 87,
"questionText": "An agent’s policy converges to suboptimal deterministic behavior too early. Which method encourages exploration?",
"options": [
"Remove critic network",
"Ignore reward shaping",
"Increase discount factor to 1",
"Add entropy regularization to the loss function"
],
"correctAnswerIndex": 3,
"explanation": "Entropy regularization encourages stochastic actions, preventing premature convergence."
},
{
"id": 88,
"questionText": "During DRL training, target Q-values fluctuate wildly causing instability. Which adjustment helps?",
"options": [
"Use soft updates for target networks",
"Use deterministic actions only",
"Remove replay buffer",
"Reduce reward magnitude to zero"
],
"correctAnswerIndex": 0,
"explanation": "Soft updates reduce sudden changes in target Q-values, stabilizing training."
},
{
"id": 89,
"questionText": "An agent trained in a sparse reward environment fails to discover optimal behavior. What can help?",
"options": [
"Introduce shaped or auxiliary rewards for intermediate goals",
"Remove actor network",
"Reduce learning rate to zero",
"Use deterministic greedy policy"
],
"correctAnswerIndex": 0,
"explanation": "Shaped rewards provide more frequent feedback, helping the agent learn useful behaviors."
},
{
"id": 90,
"questionText": "During training, a continuous control DRL agent oscillates near optimal policy. What adjustment helps?",
"options": [
"Reduce learning rate and apply soft target updates",
"Reduce discount factor to zero",
"Remove actor network",
"Ignore reward signals"
],
"correctAnswerIndex": 0,
"explanation": "Small learning rates and soft target updates prevent large weight changes, reducing oscillations."
},
{
"id": 91,
"questionText": "A robotic arm using DDPG reaches the target inconsistently. Which technique can improve stability?",
"options": [
"Use target smoothing and reward shaping",
"Ignore experience replay",
"Reduce discount factor",
"Remove actor network"
],
"correctAnswerIndex": 0,
"explanation": "Target smoothing stabilizes Q-value estimates, and reward shaping guides the agent towards correct behavior."
},
{
"id": 92,
"questionText": "A DRL agent in a stochastic maze overestimates Q-values. What solution helps?",
"options": [
"Reduce exploration",
"Use Double DQN",
"Ignore rewards",
"Use deterministic greedy policy"
],
"correctAnswerIndex": 1,
"explanation": "Double DQN mitigates overestimation by decoupling action selection from evaluation."
},
{
"id": 93,
"questionText": "An agent shows slow learning due to correlated sequential samples. Which technique helps?",
"options": [
"Ignore rewards",
"Experience replay with random sampling",
"Reduce discount factor",
"Remove critic network"
],
"correctAnswerIndex": 1,
"explanation": "Random sampling from replay memory breaks temporal correlations, improving stability and convergence."
},
{
"id": 94,
"questionText": "In a partially observable environment, an agent fails to infer state. Which method can help?",
"options": [
"Increase learning rate",
"Use DQN only",
"Remove reward shaping",
"Use RNNs or LSTMs to encode history"
],
"correctAnswerIndex": 3,
"explanation": "RNNs or LSTMs maintain memory of past observations, allowing better state inference."
},
{
"id": 95,
"questionText": "An agent trained in continuous control fails to explore. Which solution improves performance?",
"options": [
"Use deterministic policy only",
"Reduce discount factor",
"Add temporally correlated noise to actions",
"Ignore reward shaping"
],
"correctAnswerIndex": 2,
"explanation": "Temporally correlated noise encourages exploration in continuous action spaces."
},
{
"id": 96,
"questionText": "During DRL training, an agent converges to a suboptimal deterministic policy. How to improve?",
"options": [
"Add entropy regularization to encourage stochasticity",
"Use DQN only",
"Reduce learning rate",
"Ignore rewards"
],
"correctAnswerIndex": 0,
"explanation": "Entropy regularization prevents premature convergence to deterministic policies, encouraging exploration."
},
{
"id": 97,
"questionText": "A DRL agent trained in a high-dimensional visual environment struggles. Which network helps?",
"options": [
"Decision trees",
"RNNs only",
"Convolutional Neural Networks (CNNs)",
"Fully connected networks only"
],
"correctAnswerIndex": 2,
"explanation": "CNNs extract spatial features from images, enabling learning in complex visual environments."
},
{
"id": 98,
"questionText": "An agent’s Q-values explode during training in continuous control. What helps?",
"options": [
"Use deterministic greedy policy only",
"Remove actor network",
"Gradient clipping and smaller learning rates",
"Increase rewards drastically"
],
"correctAnswerIndex": 2,
"explanation": "Gradient clipping prevents large updates that destabilize learning in DRL networks."
},
{
"id": 99,
"questionText": "In a multi-agent environment, agents’ interactions destabilize learning. What can help?",
"options": [
"Centralized training with decentralized execution",
"Remove actor network",
"Reduce discount factor to zero",
"Ignore rewards"
],
"correctAnswerIndex": 0,
"explanation": "Centralized training considers interactions, while decentralized execution allows individual agents to act independently."
},
{
"id": 100,
"questionText": "A robotic agent using DRL performs poorly after transferring from simulation to real world. What can help?",
"options": [
"Remove actor network",
"Use deterministic greedy policy",
"Domain randomization and fine-tuning in real environment",
"Reduce discount factor to zero"
],
"correctAnswerIndex": 2,
"explanation": "Domain randomization improves robustness to variations, and fine-tuning adapts the policy to real-world dynamics."
}
]
}