| { |
| "best_metric": 1.719967246055603, |
| "best_model_checkpoint": "ckpts/sft_OLMo-1B-hf/checkpoint-940", |
| "epoch": 4.96042216358839, |
| "eval_steps": 20, |
| "global_step": 940, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.10554089709762533, |
| "grad_norm": 10.3125, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 2.4799, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10554089709762533, |
| "eval_loss": 2.3184142112731934, |
| "eval_runtime": 4.0729, |
| "eval_samples_per_second": 49.105, |
| "eval_steps_per_second": 12.276, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.21108179419525067, |
| "grad_norm": 7.46875, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 2.2916, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.21108179419525067, |
| "eval_loss": 2.265637159347534, |
| "eval_runtime": 4.034, |
| "eval_samples_per_second": 49.579, |
| "eval_steps_per_second": 12.395, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.316622691292876, |
| "grad_norm": 7.1875, |
| "learning_rate": 1e-05, |
| "loss": 2.2737, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.316622691292876, |
| "eval_loss": 2.2540640830993652, |
| "eval_runtime": 3.9951, |
| "eval_samples_per_second": 50.062, |
| "eval_steps_per_second": 12.515, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.42216358839050133, |
| "grad_norm": 7.25, |
| "learning_rate": 1e-05, |
| "loss": 2.1889, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.42216358839050133, |
| "eval_loss": 2.2447378635406494, |
| "eval_runtime": 4.3965, |
| "eval_samples_per_second": 45.491, |
| "eval_steps_per_second": 11.373, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5277044854881267, |
| "grad_norm": 8.0, |
| "learning_rate": 1e-05, |
| "loss": 2.2005, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5277044854881267, |
| "eval_loss": 2.225715398788452, |
| "eval_runtime": 4.0633, |
| "eval_samples_per_second": 49.221, |
| "eval_steps_per_second": 12.305, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.633245382585752, |
| "grad_norm": 7.46875, |
| "learning_rate": 1e-05, |
| "loss": 2.1915, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.633245382585752, |
| "eval_loss": 2.208789587020874, |
| "eval_runtime": 4.4371, |
| "eval_samples_per_second": 45.075, |
| "eval_steps_per_second": 11.269, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.7387862796833773, |
| "grad_norm": 7.875, |
| "learning_rate": 1e-05, |
| "loss": 2.2115, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7387862796833773, |
| "eval_loss": 2.189687728881836, |
| "eval_runtime": 4.427, |
| "eval_samples_per_second": 45.177, |
| "eval_steps_per_second": 11.294, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.8443271767810027, |
| "grad_norm": 7.28125, |
| "learning_rate": 1e-05, |
| "loss": 2.1754, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8443271767810027, |
| "eval_loss": 2.1662068367004395, |
| "eval_runtime": 4.4352, |
| "eval_samples_per_second": 45.094, |
| "eval_steps_per_second": 11.273, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.9498680738786279, |
| "grad_norm": 7.4375, |
| "learning_rate": 1e-05, |
| "loss": 2.1529, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9498680738786279, |
| "eval_loss": 2.151796340942383, |
| "eval_runtime": 4.578, |
| "eval_samples_per_second": 43.688, |
| "eval_steps_per_second": 10.922, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.0554089709762533, |
| "grad_norm": 7.4375, |
| "learning_rate": 1e-05, |
| "loss": 2.0596, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0554089709762533, |
| "eval_loss": 2.150334119796753, |
| "eval_runtime": 4.3064, |
| "eval_samples_per_second": 46.443, |
| "eval_steps_per_second": 11.611, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.1609498680738786, |
| "grad_norm": 8.3125, |
| "learning_rate": 1e-05, |
| "loss": 1.9336, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.1609498680738786, |
| "eval_loss": 2.138848066329956, |
| "eval_runtime": 4.7845, |
| "eval_samples_per_second": 41.802, |
| "eval_steps_per_second": 10.45, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.266490765171504, |
| "grad_norm": 8.125, |
| "learning_rate": 1e-05, |
| "loss": 1.917, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.266490765171504, |
| "eval_loss": 2.1307833194732666, |
| "eval_runtime": 4.549, |
| "eval_samples_per_second": 43.966, |
| "eval_steps_per_second": 10.992, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.3720316622691293, |
| "grad_norm": 8.375, |
| "learning_rate": 1e-05, |
| "loss": 1.9214, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.3720316622691293, |
| "eval_loss": 2.11213755607605, |
| "eval_runtime": 4.4528, |
| "eval_samples_per_second": 44.916, |
| "eval_steps_per_second": 11.229, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.4775725593667546, |
| "grad_norm": 8.25, |
| "learning_rate": 1e-05, |
| "loss": 1.9631, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.4775725593667546, |
| "eval_loss": 2.0881500244140625, |
| "eval_runtime": 4.489, |
| "eval_samples_per_second": 44.553, |
| "eval_steps_per_second": 11.138, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.58311345646438, |
| "grad_norm": 8.5, |
| "learning_rate": 1e-05, |
| "loss": 1.8888, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.58311345646438, |
| "eval_loss": 2.0727522373199463, |
| "eval_runtime": 4.6234, |
| "eval_samples_per_second": 43.258, |
| "eval_steps_per_second": 10.815, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.6886543535620053, |
| "grad_norm": 8.75, |
| "learning_rate": 1e-05, |
| "loss": 1.8634, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.6886543535620053, |
| "eval_loss": 2.0583410263061523, |
| "eval_runtime": 4.3979, |
| "eval_samples_per_second": 45.476, |
| "eval_steps_per_second": 11.369, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.7941952506596306, |
| "grad_norm": 9.625, |
| "learning_rate": 1e-05, |
| "loss": 1.8716, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.7941952506596306, |
| "eval_loss": 2.0440073013305664, |
| "eval_runtime": 4.4336, |
| "eval_samples_per_second": 45.111, |
| "eval_steps_per_second": 11.278, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.899736147757256, |
| "grad_norm": 8.625, |
| "learning_rate": 1e-05, |
| "loss": 1.8626, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.899736147757256, |
| "eval_loss": 2.027642011642456, |
| "eval_runtime": 4.5994, |
| "eval_samples_per_second": 43.484, |
| "eval_steps_per_second": 10.871, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.005277044854881, |
| "grad_norm": 10.0625, |
| "learning_rate": 1e-05, |
| "loss": 1.8374, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.005277044854881, |
| "eval_loss": 2.023581027984619, |
| "eval_runtime": 4.513, |
| "eval_samples_per_second": 44.316, |
| "eval_steps_per_second": 11.079, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.1108179419525066, |
| "grad_norm": 10.3125, |
| "learning_rate": 1e-05, |
| "loss": 1.6156, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.1108179419525066, |
| "eval_loss": 2.034921169281006, |
| "eval_runtime": 4.3548, |
| "eval_samples_per_second": 45.926, |
| "eval_steps_per_second": 11.482, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.216358839050132, |
| "grad_norm": 10.5, |
| "learning_rate": 1e-05, |
| "loss": 1.571, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.216358839050132, |
| "eval_loss": 2.0253899097442627, |
| "eval_runtime": 4.4896, |
| "eval_samples_per_second": 44.547, |
| "eval_steps_per_second": 11.137, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.321899736147757, |
| "grad_norm": 10.9375, |
| "learning_rate": 1e-05, |
| "loss": 1.5824, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.321899736147757, |
| "eval_loss": 2.000455141067505, |
| "eval_runtime": 4.3294, |
| "eval_samples_per_second": 46.195, |
| "eval_steps_per_second": 11.549, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.4274406332453826, |
| "grad_norm": 11.25, |
| "learning_rate": 1e-05, |
| "loss": 1.532, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.4274406332453826, |
| "eval_loss": 2.0012362003326416, |
| "eval_runtime": 4.4248, |
| "eval_samples_per_second": 45.2, |
| "eval_steps_per_second": 11.3, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.532981530343008, |
| "grad_norm": 10.875, |
| "learning_rate": 1e-05, |
| "loss": 1.538, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.532981530343008, |
| "eval_loss": 1.9685580730438232, |
| "eval_runtime": 4.6986, |
| "eval_samples_per_second": 42.566, |
| "eval_steps_per_second": 10.642, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.638522427440633, |
| "grad_norm": 11.4375, |
| "learning_rate": 1e-05, |
| "loss": 1.5482, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.638522427440633, |
| "eval_loss": 1.945511817932129, |
| "eval_runtime": 4.3643, |
| "eval_samples_per_second": 45.826, |
| "eval_steps_per_second": 11.457, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.7440633245382586, |
| "grad_norm": 11.5625, |
| "learning_rate": 1e-05, |
| "loss": 1.5028, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.7440633245382586, |
| "eval_loss": 1.9389147758483887, |
| "eval_runtime": 4.5184, |
| "eval_samples_per_second": 44.263, |
| "eval_steps_per_second": 11.066, |
| "step": 520 |
| }, |
| { |
| "epoch": 2.849604221635884, |
| "grad_norm": 12.5625, |
| "learning_rate": 1e-05, |
| "loss": 1.4947, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.849604221635884, |
| "eval_loss": 1.9430372714996338, |
| "eval_runtime": 4.4992, |
| "eval_samples_per_second": 44.453, |
| "eval_steps_per_second": 11.113, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.955145118733509, |
| "grad_norm": 11.9375, |
| "learning_rate": 1e-05, |
| "loss": 1.5243, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.955145118733509, |
| "eval_loss": 1.9145066738128662, |
| "eval_runtime": 4.4679, |
| "eval_samples_per_second": 44.764, |
| "eval_steps_per_second": 11.191, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.0606860158311346, |
| "grad_norm": 15.0, |
| "learning_rate": 1e-05, |
| "loss": 1.3297, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.0606860158311346, |
| "eval_loss": 1.9249849319458008, |
| "eval_runtime": 4.5014, |
| "eval_samples_per_second": 44.43, |
| "eval_steps_per_second": 11.108, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.16622691292876, |
| "grad_norm": 13.25, |
| "learning_rate": 1e-05, |
| "loss": 1.21, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.16622691292876, |
| "eval_loss": 1.9324084520339966, |
| "eval_runtime": 4.7117, |
| "eval_samples_per_second": 42.447, |
| "eval_steps_per_second": 10.612, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.271767810026385, |
| "grad_norm": 15.875, |
| "learning_rate": 1e-05, |
| "loss": 1.2001, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.271767810026385, |
| "eval_loss": 1.9431959390640259, |
| "eval_runtime": 4.4958, |
| "eval_samples_per_second": 44.486, |
| "eval_steps_per_second": 11.121, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.3773087071240107, |
| "grad_norm": 15.125, |
| "learning_rate": 1e-05, |
| "loss": 1.1686, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.3773087071240107, |
| "eval_loss": 1.9009323120117188, |
| "eval_runtime": 4.3205, |
| "eval_samples_per_second": 46.291, |
| "eval_steps_per_second": 11.573, |
| "step": 640 |
| }, |
| { |
| "epoch": 3.4828496042216357, |
| "grad_norm": 16.5, |
| "learning_rate": 1e-05, |
| "loss": 1.1798, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.4828496042216357, |
| "eval_loss": 1.8920202255249023, |
| "eval_runtime": 4.5772, |
| "eval_samples_per_second": 43.695, |
| "eval_steps_per_second": 10.924, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.588390501319261, |
| "grad_norm": 14.1875, |
| "learning_rate": 1e-05, |
| "loss": 1.197, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.588390501319261, |
| "eval_loss": 1.8691601753234863, |
| "eval_runtime": 4.5889, |
| "eval_samples_per_second": 43.584, |
| "eval_steps_per_second": 10.896, |
| "step": 680 |
| }, |
| { |
| "epoch": 3.6939313984168867, |
| "grad_norm": 15.25, |
| "learning_rate": 1e-05, |
| "loss": 1.1745, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.6939313984168867, |
| "eval_loss": 1.8563519716262817, |
| "eval_runtime": 4.09, |
| "eval_samples_per_second": 48.9, |
| "eval_steps_per_second": 12.225, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.7994722955145117, |
| "grad_norm": 16.125, |
| "learning_rate": 1e-05, |
| "loss": 1.1083, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.7994722955145117, |
| "eval_loss": 1.8388882875442505, |
| "eval_runtime": 4.1971, |
| "eval_samples_per_second": 47.653, |
| "eval_steps_per_second": 11.913, |
| "step": 720 |
| }, |
| { |
| "epoch": 3.905013192612137, |
| "grad_norm": 16.25, |
| "learning_rate": 1e-05, |
| "loss": 1.1325, |
| "step": 740 |
| }, |
| { |
| "epoch": 3.905013192612137, |
| "eval_loss": 1.8317779302597046, |
| "eval_runtime": 4.353, |
| "eval_samples_per_second": 45.945, |
| "eval_steps_per_second": 11.486, |
| "step": 740 |
| }, |
| { |
| "epoch": 4.010554089709762, |
| "grad_norm": 16.25, |
| "learning_rate": 1e-05, |
| "loss": 1.0731, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.010554089709762, |
| "eval_loss": 1.8252906799316406, |
| "eval_runtime": 4.5472, |
| "eval_samples_per_second": 43.983, |
| "eval_steps_per_second": 10.996, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.116094986807388, |
| "grad_norm": 15.75, |
| "learning_rate": 1e-05, |
| "loss": 0.8763, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.116094986807388, |
| "eval_loss": 1.8407686948776245, |
| "eval_runtime": 4.341, |
| "eval_samples_per_second": 46.073, |
| "eval_steps_per_second": 11.518, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.221635883905013, |
| "grad_norm": 19.25, |
| "learning_rate": 1e-05, |
| "loss": 0.8789, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.221635883905013, |
| "eval_loss": 1.836584210395813, |
| "eval_runtime": 4.5537, |
| "eval_samples_per_second": 43.921, |
| "eval_steps_per_second": 10.98, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.327176781002638, |
| "grad_norm": 18.625, |
| "learning_rate": 1e-05, |
| "loss": 0.8585, |
| "step": 820 |
| }, |
| { |
| "epoch": 4.327176781002638, |
| "eval_loss": 1.826670527458191, |
| "eval_runtime": 4.5393, |
| "eval_samples_per_second": 44.06, |
| "eval_steps_per_second": 11.015, |
| "step": 820 |
| }, |
| { |
| "epoch": 4.432717678100264, |
| "grad_norm": 18.0, |
| "learning_rate": 1e-05, |
| "loss": 0.7994, |
| "step": 840 |
| }, |
| { |
| "epoch": 4.432717678100264, |
| "eval_loss": 1.823104977607727, |
| "eval_runtime": 4.5835, |
| "eval_samples_per_second": 43.635, |
| "eval_steps_per_second": 10.909, |
| "step": 840 |
| }, |
| { |
| "epoch": 4.538258575197889, |
| "grad_norm": 17.125, |
| "learning_rate": 1e-05, |
| "loss": 0.828, |
| "step": 860 |
| }, |
| { |
| "epoch": 4.538258575197889, |
| "eval_loss": 1.7835866212844849, |
| "eval_runtime": 4.4222, |
| "eval_samples_per_second": 45.227, |
| "eval_steps_per_second": 11.307, |
| "step": 860 |
| }, |
| { |
| "epoch": 4.643799472295514, |
| "grad_norm": 15.8125, |
| "learning_rate": 1e-05, |
| "loss": 0.8055, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.643799472295514, |
| "eval_loss": 1.776289939880371, |
| "eval_runtime": 4.451, |
| "eval_samples_per_second": 44.934, |
| "eval_steps_per_second": 11.234, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.74934036939314, |
| "grad_norm": 16.125, |
| "learning_rate": 1e-05, |
| "loss": 0.8072, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.74934036939314, |
| "eval_loss": 1.7724196910858154, |
| "eval_runtime": 4.6227, |
| "eval_samples_per_second": 43.264, |
| "eval_steps_per_second": 10.816, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.854881266490765, |
| "grad_norm": 15.1875, |
| "learning_rate": 1e-05, |
| "loss": 0.8029, |
| "step": 920 |
| }, |
| { |
| "epoch": 4.854881266490765, |
| "eval_loss": 1.7451767921447754, |
| "eval_runtime": 4.5459, |
| "eval_samples_per_second": 43.995, |
| "eval_steps_per_second": 10.999, |
| "step": 920 |
| }, |
| { |
| "epoch": 4.96042216358839, |
| "grad_norm": 19.125, |
| "learning_rate": 1e-05, |
| "loss": 0.7929, |
| "step": 940 |
| }, |
| { |
| "epoch": 4.96042216358839, |
| "eval_loss": 1.719967246055603, |
| "eval_runtime": 4.7451, |
| "eval_samples_per_second": 42.148, |
| "eval_steps_per_second": 10.537, |
| "step": 940 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 9450, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 20, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.968166912425984e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|