diff --git "a/checkpoint-4210/trainer_state.json" "b/checkpoint-4210/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4210/trainer_state.json" @@ -0,0 +1,30176 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 50, + "global_step": 4210, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023788284269997025, + "grad_norm": 1.04987633228302, + "learning_rate": 0.0, + "loss": 1.8792, + "step": 1 + }, + { + "epoch": 0.004757656853999405, + "grad_norm": 1.1409175395965576, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.1718, + "step": 2 + }, + { + "epoch": 0.007136485280999108, + "grad_norm": 1.1656651496887207, + "learning_rate": 4.000000000000001e-06, + "loss": 2.1308, + "step": 3 + }, + { + "epoch": 0.00951531370799881, + "grad_norm": 1.2673240900039673, + "learning_rate": 6e-06, + "loss": 2.2508, + "step": 4 + }, + { + "epoch": 0.011894142134998514, + "grad_norm": 1.1032017469406128, + "learning_rate": 8.000000000000001e-06, + "loss": 2.0553, + "step": 5 + }, + { + "epoch": 0.014272970561998216, + "grad_norm": 0.9813495874404907, + "learning_rate": 1e-05, + "loss": 2.0202, + "step": 6 + }, + { + "epoch": 0.016651798988997917, + "grad_norm": 1.0528686046600342, + "learning_rate": 9.997621878715815e-06, + "loss": 1.9175, + "step": 7 + }, + { + "epoch": 0.01903062741599762, + "grad_norm": 1.3686059713363647, + "learning_rate": 9.99524375743163e-06, + "loss": 2.32, + "step": 8 + }, + { + "epoch": 0.021409455842997322, + "grad_norm": 0.9712055325508118, + "learning_rate": 9.992865636147444e-06, + "loss": 1.9115, + "step": 9 + }, + { + "epoch": 0.023788284269997028, + "grad_norm": 1.227145791053772, + "learning_rate": 9.99048751486326e-06, + "loss": 2.2411, + "step": 10 + }, + { + "epoch": 0.02616711269699673, + "grad_norm": 1.2151082754135132, + "learning_rate": 9.988109393579073e-06, + "loss": 2.1864, + "step": 11 + }, + { + "epoch": 0.028545941123996433, + "grad_norm": 1.1602543592453003, + "learning_rate": 9.985731272294887e-06, + "loss": 2.109, + "step": 12 + }, + { + "epoch": 0.030924769550996135, + "grad_norm": 1.1971856355667114, + "learning_rate": 9.983353151010702e-06, + "loss": 2.0318, + "step": 13 + }, + { + "epoch": 0.033303597977995834, + "grad_norm": 1.1477810144424438, + "learning_rate": 9.980975029726518e-06, + "loss": 2.1216, + "step": 14 + }, + { + "epoch": 0.03568242640499554, + "grad_norm": 1.041809320449829, + "learning_rate": 9.978596908442331e-06, + "loss": 1.8973, + "step": 15 + }, + { + "epoch": 0.03806125483199524, + "grad_norm": 1.1508628129959106, + "learning_rate": 9.976218787158147e-06, + "loss": 1.927, + "step": 16 + }, + { + "epoch": 0.040440083258994945, + "grad_norm": 1.1186611652374268, + "learning_rate": 9.97384066587396e-06, + "loss": 2.0459, + "step": 17 + }, + { + "epoch": 0.042818911685994644, + "grad_norm": 1.035578727722168, + "learning_rate": 9.971462544589774e-06, + "loss": 1.7897, + "step": 18 + }, + { + "epoch": 0.04519774011299435, + "grad_norm": 1.0108476877212524, + "learning_rate": 9.96908442330559e-06, + "loss": 1.7568, + "step": 19 + }, + { + "epoch": 0.047576568539994056, + "grad_norm": 1.153806209564209, + "learning_rate": 9.966706302021404e-06, + "loss": 2.0361, + "step": 20 + }, + { + "epoch": 0.049955396966993755, + "grad_norm": 1.0752531290054321, + "learning_rate": 9.964328180737219e-06, + "loss": 1.9924, + "step": 21 + }, + { + "epoch": 0.05233422539399346, + "grad_norm": 1.0009926557540894, + "learning_rate": 9.961950059453033e-06, + "loss": 1.8876, + "step": 22 + }, + { + "epoch": 0.05471305382099316, + "grad_norm": 0.9451888203620911, + "learning_rate": 9.959571938168847e-06, + "loss": 1.7936, + "step": 23 + }, + { + "epoch": 0.057091882247992866, + "grad_norm": 1.0952671766281128, + "learning_rate": 9.957193816884662e-06, + "loss": 1.9022, + "step": 24 + }, + { + "epoch": 0.059470710674992565, + "grad_norm": 0.8680810332298279, + "learning_rate": 9.954815695600476e-06, + "loss": 1.6335, + "step": 25 + }, + { + "epoch": 0.06184953910199227, + "grad_norm": 1.0780009031295776, + "learning_rate": 9.95243757431629e-06, + "loss": 1.8751, + "step": 26 + }, + { + "epoch": 0.06422836752899197, + "grad_norm": 0.9501339793205261, + "learning_rate": 9.950059453032107e-06, + "loss": 1.6164, + "step": 27 + }, + { + "epoch": 0.06660719595599167, + "grad_norm": 0.9250319004058838, + "learning_rate": 9.94768133174792e-06, + "loss": 1.6453, + "step": 28 + }, + { + "epoch": 0.06898602438299138, + "grad_norm": 0.8910840153694153, + "learning_rate": 9.945303210463734e-06, + "loss": 1.5891, + "step": 29 + }, + { + "epoch": 0.07136485280999108, + "grad_norm": 0.849561333656311, + "learning_rate": 9.94292508917955e-06, + "loss": 1.4954, + "step": 30 + }, + { + "epoch": 0.07374368123699078, + "grad_norm": 0.8809520602226257, + "learning_rate": 9.940546967895364e-06, + "loss": 1.5457, + "step": 31 + }, + { + "epoch": 0.07612250966399048, + "grad_norm": 0.7879959940910339, + "learning_rate": 9.938168846611177e-06, + "loss": 1.5475, + "step": 32 + }, + { + "epoch": 0.07850133809099019, + "grad_norm": 0.8514576554298401, + "learning_rate": 9.935790725326993e-06, + "loss": 1.6687, + "step": 33 + }, + { + "epoch": 0.08088016651798989, + "grad_norm": 0.9125307202339172, + "learning_rate": 9.933412604042807e-06, + "loss": 1.6012, + "step": 34 + }, + { + "epoch": 0.08325899494498959, + "grad_norm": 0.7342825531959534, + "learning_rate": 9.931034482758622e-06, + "loss": 1.483, + "step": 35 + }, + { + "epoch": 0.08563782337198929, + "grad_norm": 0.9066355228424072, + "learning_rate": 9.928656361474436e-06, + "loss": 1.5848, + "step": 36 + }, + { + "epoch": 0.088016651798989, + "grad_norm": 0.7033758759498596, + "learning_rate": 9.92627824019025e-06, + "loss": 1.4012, + "step": 37 + }, + { + "epoch": 0.0903954802259887, + "grad_norm": 0.7228349447250366, + "learning_rate": 9.923900118906065e-06, + "loss": 1.4109, + "step": 38 + }, + { + "epoch": 0.0927743086529884, + "grad_norm": 0.8707212209701538, + "learning_rate": 9.921521997621879e-06, + "loss": 1.5935, + "step": 39 + }, + { + "epoch": 0.09515313707998811, + "grad_norm": 0.725263774394989, + "learning_rate": 9.919143876337694e-06, + "loss": 1.3621, + "step": 40 + }, + { + "epoch": 0.09753196550698781, + "grad_norm": 0.8578786253929138, + "learning_rate": 9.91676575505351e-06, + "loss": 1.6145, + "step": 41 + }, + { + "epoch": 0.09991079393398751, + "grad_norm": 0.7685489654541016, + "learning_rate": 9.914387633769323e-06, + "loss": 1.466, + "step": 42 + }, + { + "epoch": 0.10228962236098721, + "grad_norm": 0.6707257032394409, + "learning_rate": 9.912009512485137e-06, + "loss": 1.3976, + "step": 43 + }, + { + "epoch": 0.10466845078798692, + "grad_norm": 0.7726966142654419, + "learning_rate": 9.909631391200953e-06, + "loss": 1.381, + "step": 44 + }, + { + "epoch": 0.10704727921498662, + "grad_norm": 0.7161576151847839, + "learning_rate": 9.907253269916766e-06, + "loss": 1.3305, + "step": 45 + }, + { + "epoch": 0.10942610764198632, + "grad_norm": 0.7570360898971558, + "learning_rate": 9.904875148632582e-06, + "loss": 1.4697, + "step": 46 + }, + { + "epoch": 0.11180493606898602, + "grad_norm": 0.6843801736831665, + "learning_rate": 9.902497027348396e-06, + "loss": 1.3856, + "step": 47 + }, + { + "epoch": 0.11418376449598573, + "grad_norm": 0.682535707950592, + "learning_rate": 9.90011890606421e-06, + "loss": 1.3322, + "step": 48 + }, + { + "epoch": 0.11656259292298543, + "grad_norm": 0.7024103403091431, + "learning_rate": 9.897740784780025e-06, + "loss": 1.401, + "step": 49 + }, + { + "epoch": 0.11894142134998513, + "grad_norm": 0.6288143396377563, + "learning_rate": 9.895362663495839e-06, + "loss": 1.2401, + "step": 50 + }, + { + "epoch": 0.11894142134998513, + "eval_loss": 1.362038254737854, + "eval_runtime": 35.3896, + "eval_samples_per_second": 21.136, + "eval_steps_per_second": 10.568, + "step": 50 + }, + { + "epoch": 0.12132024977698483, + "grad_norm": 0.6913635730743408, + "learning_rate": 9.892984542211654e-06, + "loss": 1.3756, + "step": 51 + }, + { + "epoch": 0.12369907820398454, + "grad_norm": 0.6254145503044128, + "learning_rate": 9.890606420927468e-06, + "loss": 1.1993, + "step": 52 + }, + { + "epoch": 0.12607790663098423, + "grad_norm": 0.6881937384605408, + "learning_rate": 9.888228299643282e-06, + "loss": 1.2926, + "step": 53 + }, + { + "epoch": 0.12845673505798394, + "grad_norm": 0.7623552680015564, + "learning_rate": 9.885850178359097e-06, + "loss": 1.3869, + "step": 54 + }, + { + "epoch": 0.13083556348498365, + "grad_norm": 0.6980481147766113, + "learning_rate": 9.883472057074913e-06, + "loss": 1.2383, + "step": 55 + }, + { + "epoch": 0.13321439191198334, + "grad_norm": 0.6189219355583191, + "learning_rate": 9.881093935790726e-06, + "loss": 1.2787, + "step": 56 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 0.5519039034843445, + "learning_rate": 9.878715814506542e-06, + "loss": 1.1192, + "step": 57 + }, + { + "epoch": 0.13797204876598276, + "grad_norm": 0.6936759352684021, + "learning_rate": 9.876337693222356e-06, + "loss": 1.3044, + "step": 58 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 0.5263488292694092, + "learning_rate": 9.87395957193817e-06, + "loss": 1.0402, + "step": 59 + }, + { + "epoch": 0.14272970561998216, + "grad_norm": 0.6104781031608582, + "learning_rate": 9.871581450653985e-06, + "loss": 1.1769, + "step": 60 + }, + { + "epoch": 0.14510853404698187, + "grad_norm": 0.6430218815803528, + "learning_rate": 9.869203329369798e-06, + "loss": 1.2046, + "step": 61 + }, + { + "epoch": 0.14748736247398156, + "grad_norm": 0.6199575662612915, + "learning_rate": 9.866825208085612e-06, + "loss": 1.1345, + "step": 62 + }, + { + "epoch": 0.14986619090098127, + "grad_norm": 0.6026303768157959, + "learning_rate": 9.864447086801428e-06, + "loss": 1.1674, + "step": 63 + }, + { + "epoch": 0.15224501932798096, + "grad_norm": 0.6087438464164734, + "learning_rate": 9.862068965517241e-06, + "loss": 1.0922, + "step": 64 + }, + { + "epoch": 0.15462384775498067, + "grad_norm": 0.6703237891197205, + "learning_rate": 9.859690844233057e-06, + "loss": 1.2596, + "step": 65 + }, + { + "epoch": 0.15700267618198038, + "grad_norm": 0.63856041431427, + "learning_rate": 9.85731272294887e-06, + "loss": 1.1532, + "step": 66 + }, + { + "epoch": 0.15938150460898007, + "grad_norm": 0.6889049410820007, + "learning_rate": 9.854934601664684e-06, + "loss": 1.0993, + "step": 67 + }, + { + "epoch": 0.16176033303597978, + "grad_norm": 0.6619901657104492, + "learning_rate": 9.8525564803805e-06, + "loss": 1.0711, + "step": 68 + }, + { + "epoch": 0.1641391614629795, + "grad_norm": 0.6845427751541138, + "learning_rate": 9.850178359096315e-06, + "loss": 1.162, + "step": 69 + }, + { + "epoch": 0.16651798988997918, + "grad_norm": 0.5410576462745667, + "learning_rate": 9.847800237812129e-06, + "loss": 0.9658, + "step": 70 + }, + { + "epoch": 0.1688968183169789, + "grad_norm": 0.669938325881958, + "learning_rate": 9.845422116527945e-06, + "loss": 1.0857, + "step": 71 + }, + { + "epoch": 0.17127564674397858, + "grad_norm": 0.6215769648551941, + "learning_rate": 9.843043995243758e-06, + "loss": 1.0738, + "step": 72 + }, + { + "epoch": 0.1736544751709783, + "grad_norm": 0.6231212019920349, + "learning_rate": 9.840665873959572e-06, + "loss": 1.1192, + "step": 73 + }, + { + "epoch": 0.176033303597978, + "grad_norm": 0.6128190159797668, + "learning_rate": 9.838287752675388e-06, + "loss": 1.1306, + "step": 74 + }, + { + "epoch": 0.1784121320249777, + "grad_norm": 0.5584885478019714, + "learning_rate": 9.835909631391201e-06, + "loss": 0.9755, + "step": 75 + }, + { + "epoch": 0.1807909604519774, + "grad_norm": 0.5982023477554321, + "learning_rate": 9.833531510107017e-06, + "loss": 1.1342, + "step": 76 + }, + { + "epoch": 0.1831697888789771, + "grad_norm": 0.5359939336776733, + "learning_rate": 9.83115338882283e-06, + "loss": 0.9261, + "step": 77 + }, + { + "epoch": 0.1855486173059768, + "grad_norm": 0.6266713738441467, + "learning_rate": 9.828775267538644e-06, + "loss": 0.9262, + "step": 78 + }, + { + "epoch": 0.1879274457329765, + "grad_norm": 0.5741664171218872, + "learning_rate": 9.82639714625446e-06, + "loss": 0.9856, + "step": 79 + }, + { + "epoch": 0.19030627415997622, + "grad_norm": 0.6858999133110046, + "learning_rate": 9.824019024970274e-06, + "loss": 1.003, + "step": 80 + }, + { + "epoch": 0.1926851025869759, + "grad_norm": 0.534636378288269, + "learning_rate": 9.821640903686089e-06, + "loss": 0.9321, + "step": 81 + }, + { + "epoch": 0.19506393101397562, + "grad_norm": 0.7379254698753357, + "learning_rate": 9.819262782401904e-06, + "loss": 1.1168, + "step": 82 + }, + { + "epoch": 0.1974427594409753, + "grad_norm": 0.5949821472167969, + "learning_rate": 9.816884661117718e-06, + "loss": 0.9736, + "step": 83 + }, + { + "epoch": 0.19982158786797502, + "grad_norm": 0.6258150935173035, + "learning_rate": 9.814506539833532e-06, + "loss": 0.9219, + "step": 84 + }, + { + "epoch": 0.20220041629497473, + "grad_norm": 0.5986988544464111, + "learning_rate": 9.812128418549347e-06, + "loss": 0.9299, + "step": 85 + }, + { + "epoch": 0.20457924472197442, + "grad_norm": 0.5524076223373413, + "learning_rate": 9.809750297265161e-06, + "loss": 0.969, + "step": 86 + }, + { + "epoch": 0.20695807314897413, + "grad_norm": 0.5982860922813416, + "learning_rate": 9.807372175980977e-06, + "loss": 1.1513, + "step": 87 + }, + { + "epoch": 0.20933690157597384, + "grad_norm": 0.5900014042854309, + "learning_rate": 9.80499405469679e-06, + "loss": 0.9621, + "step": 88 + }, + { + "epoch": 0.21171573000297353, + "grad_norm": 0.6107502579689026, + "learning_rate": 9.802615933412604e-06, + "loss": 1.0545, + "step": 89 + }, + { + "epoch": 0.21409455842997324, + "grad_norm": 0.5245853662490845, + "learning_rate": 9.80023781212842e-06, + "loss": 0.7867, + "step": 90 + }, + { + "epoch": 0.21647338685697295, + "grad_norm": 0.5663040280342102, + "learning_rate": 9.797859690844233e-06, + "loss": 0.9152, + "step": 91 + }, + { + "epoch": 0.21885221528397264, + "grad_norm": 0.5418746471405029, + "learning_rate": 9.795481569560047e-06, + "loss": 0.9303, + "step": 92 + }, + { + "epoch": 0.22123104371097235, + "grad_norm": 0.616669237613678, + "learning_rate": 9.793103448275863e-06, + "loss": 0.905, + "step": 93 + }, + { + "epoch": 0.22360987213797204, + "grad_norm": 0.5245363712310791, + "learning_rate": 9.790725326991676e-06, + "loss": 0.8738, + "step": 94 + }, + { + "epoch": 0.22598870056497175, + "grad_norm": 0.5157999396324158, + "learning_rate": 9.788347205707492e-06, + "loss": 0.7949, + "step": 95 + }, + { + "epoch": 0.22836752899197146, + "grad_norm": 0.6164352297782898, + "learning_rate": 9.785969084423307e-06, + "loss": 0.84, + "step": 96 + }, + { + "epoch": 0.23074635741897115, + "grad_norm": 0.6234341263771057, + "learning_rate": 9.783590963139121e-06, + "loss": 0.9075, + "step": 97 + }, + { + "epoch": 0.23312518584597086, + "grad_norm": 0.5788864493370056, + "learning_rate": 9.781212841854935e-06, + "loss": 0.9117, + "step": 98 + }, + { + "epoch": 0.23550401427297057, + "grad_norm": 0.5705299377441406, + "learning_rate": 9.77883472057075e-06, + "loss": 0.9135, + "step": 99 + }, + { + "epoch": 0.23788284269997026, + "grad_norm": 0.5773945450782776, + "learning_rate": 9.776456599286564e-06, + "loss": 0.9695, + "step": 100 + }, + { + "epoch": 0.23788284269997026, + "eval_loss": 0.8687047362327576, + "eval_runtime": 23.6665, + "eval_samples_per_second": 31.606, + "eval_steps_per_second": 15.803, + "step": 100 + }, + { + "epoch": 0.24026167112696997, + "grad_norm": 0.5697104334831238, + "learning_rate": 9.77407847800238e-06, + "loss": 0.9242, + "step": 101 + }, + { + "epoch": 0.24264049955396966, + "grad_norm": 0.639143705368042, + "learning_rate": 9.771700356718193e-06, + "loss": 0.8591, + "step": 102 + }, + { + "epoch": 0.24501932798096937, + "grad_norm": 0.5930898189544678, + "learning_rate": 9.769322235434007e-06, + "loss": 0.8499, + "step": 103 + }, + { + "epoch": 0.24739815640796908, + "grad_norm": 0.6181649565696716, + "learning_rate": 9.766944114149823e-06, + "loss": 0.8784, + "step": 104 + }, + { + "epoch": 0.24977698483496877, + "grad_norm": 0.5637544989585876, + "learning_rate": 9.764565992865636e-06, + "loss": 0.888, + "step": 105 + }, + { + "epoch": 0.25215581326196845, + "grad_norm": 0.5277851223945618, + "learning_rate": 9.762187871581452e-06, + "loss": 0.8132, + "step": 106 + }, + { + "epoch": 0.25453464168896817, + "grad_norm": 0.535109281539917, + "learning_rate": 9.759809750297266e-06, + "loss": 0.8426, + "step": 107 + }, + { + "epoch": 0.2569134701159679, + "grad_norm": 0.4883708357810974, + "learning_rate": 9.75743162901308e-06, + "loss": 0.8307, + "step": 108 + }, + { + "epoch": 0.2592922985429676, + "grad_norm": 0.527755618095398, + "learning_rate": 9.755053507728895e-06, + "loss": 0.7757, + "step": 109 + }, + { + "epoch": 0.2616711269699673, + "grad_norm": 0.6098859906196594, + "learning_rate": 9.75267538644471e-06, + "loss": 0.9933, + "step": 110 + }, + { + "epoch": 0.264049955396967, + "grad_norm": 0.6365350484848022, + "learning_rate": 9.750297265160524e-06, + "loss": 0.8823, + "step": 111 + }, + { + "epoch": 0.2664287838239667, + "grad_norm": 0.5915812253952026, + "learning_rate": 9.74791914387634e-06, + "loss": 0.8676, + "step": 112 + }, + { + "epoch": 0.2688076122509664, + "grad_norm": 0.6166291236877441, + "learning_rate": 9.745541022592153e-06, + "loss": 0.7712, + "step": 113 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 0.48422136902809143, + "learning_rate": 9.743162901307967e-06, + "loss": 0.6927, + "step": 114 + }, + { + "epoch": 0.2735652691049658, + "grad_norm": 0.5516798496246338, + "learning_rate": 9.740784780023782e-06, + "loss": 0.8027, + "step": 115 + }, + { + "epoch": 0.2759440975319655, + "grad_norm": 0.5779426097869873, + "learning_rate": 9.738406658739596e-06, + "loss": 0.7099, + "step": 116 + }, + { + "epoch": 0.2783229259589652, + "grad_norm": 0.6701722145080566, + "learning_rate": 9.736028537455412e-06, + "loss": 0.7408, + "step": 117 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.5915306806564331, + "learning_rate": 9.733650416171225e-06, + "loss": 0.7656, + "step": 118 + }, + { + "epoch": 0.2830805828129646, + "grad_norm": 0.5316159129142761, + "learning_rate": 9.731272294887039e-06, + "loss": 0.7779, + "step": 119 + }, + { + "epoch": 0.2854594112399643, + "grad_norm": 0.6183955073356628, + "learning_rate": 9.728894173602855e-06, + "loss": 0.8033, + "step": 120 + }, + { + "epoch": 0.28783823966696404, + "grad_norm": 0.5588330626487732, + "learning_rate": 9.726516052318668e-06, + "loss": 0.691, + "step": 121 + }, + { + "epoch": 0.29021706809396375, + "grad_norm": 0.555540144443512, + "learning_rate": 9.724137931034484e-06, + "loss": 0.7472, + "step": 122 + }, + { + "epoch": 0.2925958965209634, + "grad_norm": 0.4914023280143738, + "learning_rate": 9.7217598097503e-06, + "loss": 0.6662, + "step": 123 + }, + { + "epoch": 0.2949747249479631, + "grad_norm": 0.6081638932228088, + "learning_rate": 9.719381688466113e-06, + "loss": 0.7472, + "step": 124 + }, + { + "epoch": 0.29735355337496283, + "grad_norm": 0.5591022372245789, + "learning_rate": 9.717003567181927e-06, + "loss": 0.7947, + "step": 125 + }, + { + "epoch": 0.29973238180196254, + "grad_norm": 0.5352297425270081, + "learning_rate": 9.714625445897742e-06, + "loss": 0.7374, + "step": 126 + }, + { + "epoch": 0.30211121022896226, + "grad_norm": 0.571910560131073, + "learning_rate": 9.712247324613556e-06, + "loss": 0.7188, + "step": 127 + }, + { + "epoch": 0.3044900386559619, + "grad_norm": 0.5215409398078918, + "learning_rate": 9.709869203329371e-06, + "loss": 0.6405, + "step": 128 + }, + { + "epoch": 0.3068688670829616, + "grad_norm": 0.5397109389305115, + "learning_rate": 9.707491082045185e-06, + "loss": 0.8183, + "step": 129 + }, + { + "epoch": 0.30924769550996134, + "grad_norm": 0.4725683629512787, + "learning_rate": 9.705112960760999e-06, + "loss": 0.6222, + "step": 130 + }, + { + "epoch": 0.31162652393696105, + "grad_norm": 0.6211969256401062, + "learning_rate": 9.702734839476814e-06, + "loss": 0.7078, + "step": 131 + }, + { + "epoch": 0.31400535236396077, + "grad_norm": 0.6103003621101379, + "learning_rate": 9.700356718192628e-06, + "loss": 0.8314, + "step": 132 + }, + { + "epoch": 0.3163841807909605, + "grad_norm": 0.6851162314414978, + "learning_rate": 9.697978596908442e-06, + "loss": 0.7422, + "step": 133 + }, + { + "epoch": 0.31876300921796014, + "grad_norm": 0.5476438999176025, + "learning_rate": 9.695600475624257e-06, + "loss": 0.7245, + "step": 134 + }, + { + "epoch": 0.32114183764495985, + "grad_norm": 0.5277385115623474, + "learning_rate": 9.693222354340071e-06, + "loss": 0.6149, + "step": 135 + }, + { + "epoch": 0.32352066607195956, + "grad_norm": 0.5545679330825806, + "learning_rate": 9.690844233055887e-06, + "loss": 0.6269, + "step": 136 + }, + { + "epoch": 0.3258994944989593, + "grad_norm": 0.5140418410301208, + "learning_rate": 9.688466111771702e-06, + "loss": 0.607, + "step": 137 + }, + { + "epoch": 0.328278322925959, + "grad_norm": 0.6389631628990173, + "learning_rate": 9.686087990487516e-06, + "loss": 0.7011, + "step": 138 + }, + { + "epoch": 0.33065715135295864, + "grad_norm": 0.5569331645965576, + "learning_rate": 9.68370986920333e-06, + "loss": 0.6583, + "step": 139 + }, + { + "epoch": 0.33303597977995836, + "grad_norm": 0.5677202343940735, + "learning_rate": 9.681331747919145e-06, + "loss": 0.7502, + "step": 140 + }, + { + "epoch": 0.33541480820695807, + "grad_norm": 0.5870410203933716, + "learning_rate": 9.678953626634959e-06, + "loss": 0.6787, + "step": 141 + }, + { + "epoch": 0.3377936366339578, + "grad_norm": 0.7217148542404175, + "learning_rate": 9.676575505350774e-06, + "loss": 0.6702, + "step": 142 + }, + { + "epoch": 0.3401724650609575, + "grad_norm": 0.5037275552749634, + "learning_rate": 9.674197384066588e-06, + "loss": 0.6527, + "step": 143 + }, + { + "epoch": 0.34255129348795715, + "grad_norm": 0.56263267993927, + "learning_rate": 9.671819262782402e-06, + "loss": 0.654, + "step": 144 + }, + { + "epoch": 0.34493012191495687, + "grad_norm": 0.5600253939628601, + "learning_rate": 9.669441141498217e-06, + "loss": 0.7465, + "step": 145 + }, + { + "epoch": 0.3473089503419566, + "grad_norm": 0.48693767189979553, + "learning_rate": 9.667063020214031e-06, + "loss": 0.6225, + "step": 146 + }, + { + "epoch": 0.3496877787689563, + "grad_norm": 0.6318839192390442, + "learning_rate": 9.664684898929847e-06, + "loss": 0.6725, + "step": 147 + }, + { + "epoch": 0.352066607195956, + "grad_norm": 0.5716957449913025, + "learning_rate": 9.66230677764566e-06, + "loss": 0.6862, + "step": 148 + }, + { + "epoch": 0.3544454356229557, + "grad_norm": 0.45795875787734985, + "learning_rate": 9.659928656361474e-06, + "loss": 0.6283, + "step": 149 + }, + { + "epoch": 0.3568242640499554, + "grad_norm": 0.5095898509025574, + "learning_rate": 9.65755053507729e-06, + "loss": 0.6331, + "step": 150 + }, + { + "epoch": 0.3568242640499554, + "eval_loss": 0.6608059406280518, + "eval_runtime": 23.6113, + "eval_samples_per_second": 31.68, + "eval_steps_per_second": 15.84, + "step": 150 + }, + { + "epoch": 0.3592030924769551, + "grad_norm": 0.6328606009483337, + "learning_rate": 9.655172413793105e-06, + "loss": 0.6923, + "step": 151 + }, + { + "epoch": 0.3615819209039548, + "grad_norm": 0.54820716381073, + "learning_rate": 9.652794292508919e-06, + "loss": 0.6859, + "step": 152 + }, + { + "epoch": 0.3639607493309545, + "grad_norm": 0.5808321237564087, + "learning_rate": 9.650416171224734e-06, + "loss": 0.6928, + "step": 153 + }, + { + "epoch": 0.3663395777579542, + "grad_norm": 0.46660491824150085, + "learning_rate": 9.648038049940548e-06, + "loss": 0.5112, + "step": 154 + }, + { + "epoch": 0.3687184061849539, + "grad_norm": 0.5158565044403076, + "learning_rate": 9.645659928656362e-06, + "loss": 0.62, + "step": 155 + }, + { + "epoch": 0.3710972346119536, + "grad_norm": 0.5014441013336182, + "learning_rate": 9.643281807372177e-06, + "loss": 0.6136, + "step": 156 + }, + { + "epoch": 0.3734760630389533, + "grad_norm": 0.5588313341140747, + "learning_rate": 9.640903686087991e-06, + "loss": 0.5419, + "step": 157 + }, + { + "epoch": 0.375854891465953, + "grad_norm": 0.5283618569374084, + "learning_rate": 9.638525564803806e-06, + "loss": 0.5882, + "step": 158 + }, + { + "epoch": 0.37823371989295274, + "grad_norm": 0.5285916328430176, + "learning_rate": 9.63614744351962e-06, + "loss": 0.5256, + "step": 159 + }, + { + "epoch": 0.38061254831995245, + "grad_norm": 0.6327935457229614, + "learning_rate": 9.633769322235434e-06, + "loss": 0.7705, + "step": 160 + }, + { + "epoch": 0.3829913767469521, + "grad_norm": 0.4829707443714142, + "learning_rate": 9.63139120095125e-06, + "loss": 0.6309, + "step": 161 + }, + { + "epoch": 0.3853702051739518, + "grad_norm": 0.4694628417491913, + "learning_rate": 9.629013079667063e-06, + "loss": 0.6241, + "step": 162 + }, + { + "epoch": 0.38774903360095153, + "grad_norm": 0.5326478481292725, + "learning_rate": 9.626634958382877e-06, + "loss": 0.603, + "step": 163 + }, + { + "epoch": 0.39012786202795124, + "grad_norm": 0.5992695093154907, + "learning_rate": 9.624256837098694e-06, + "loss": 0.693, + "step": 164 + }, + { + "epoch": 0.39250669045495096, + "grad_norm": 0.5451353192329407, + "learning_rate": 9.621878715814508e-06, + "loss": 0.6352, + "step": 165 + }, + { + "epoch": 0.3948855188819506, + "grad_norm": 0.5283603072166443, + "learning_rate": 9.619500594530322e-06, + "loss": 0.5754, + "step": 166 + }, + { + "epoch": 0.3972643473089503, + "grad_norm": 0.4931126534938812, + "learning_rate": 9.617122473246137e-06, + "loss": 0.6412, + "step": 167 + }, + { + "epoch": 0.39964317573595004, + "grad_norm": 0.6089839935302734, + "learning_rate": 9.61474435196195e-06, + "loss": 0.691, + "step": 168 + }, + { + "epoch": 0.40202200416294975, + "grad_norm": 0.5084049105644226, + "learning_rate": 9.612366230677765e-06, + "loss": 0.5761, + "step": 169 + }, + { + "epoch": 0.40440083258994947, + "grad_norm": 0.5322434306144714, + "learning_rate": 9.60998810939358e-06, + "loss": 0.6782, + "step": 170 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 0.5002917647361755, + "learning_rate": 9.607609988109394e-06, + "loss": 0.5552, + "step": 171 + }, + { + "epoch": 0.40915848944394884, + "grad_norm": 0.5481022000312805, + "learning_rate": 9.60523186682521e-06, + "loss": 0.6888, + "step": 172 + }, + { + "epoch": 0.41153731787094855, + "grad_norm": 0.46157607436180115, + "learning_rate": 9.602853745541023e-06, + "loss": 0.5377, + "step": 173 + }, + { + "epoch": 0.41391614629794826, + "grad_norm": 0.5205973386764526, + "learning_rate": 9.600475624256837e-06, + "loss": 0.6141, + "step": 174 + }, + { + "epoch": 0.416294974724948, + "grad_norm": 0.5071077346801758, + "learning_rate": 9.598097502972652e-06, + "loss": 0.5268, + "step": 175 + }, + { + "epoch": 0.4186738031519477, + "grad_norm": 0.49213868379592896, + "learning_rate": 9.595719381688466e-06, + "loss": 0.6166, + "step": 176 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.47055673599243164, + "learning_rate": 9.593341260404281e-06, + "loss": 0.5749, + "step": 177 + }, + { + "epoch": 0.42343146000594706, + "grad_norm": 0.49697867035865784, + "learning_rate": 9.590963139120097e-06, + "loss": 0.5576, + "step": 178 + }, + { + "epoch": 0.42581028843294677, + "grad_norm": 0.462236613035202, + "learning_rate": 9.58858501783591e-06, + "loss": 0.5861, + "step": 179 + }, + { + "epoch": 0.4281891168599465, + "grad_norm": 0.4972895085811615, + "learning_rate": 9.586206896551724e-06, + "loss": 0.4986, + "step": 180 + }, + { + "epoch": 0.4305679452869462, + "grad_norm": 0.5541361570358276, + "learning_rate": 9.58382877526754e-06, + "loss": 0.6296, + "step": 181 + }, + { + "epoch": 0.4329467737139459, + "grad_norm": 0.481115460395813, + "learning_rate": 9.581450653983354e-06, + "loss": 0.5875, + "step": 182 + }, + { + "epoch": 0.43532560214094557, + "grad_norm": 0.5035417675971985, + "learning_rate": 9.57907253269917e-06, + "loss": 0.5387, + "step": 183 + }, + { + "epoch": 0.4377044305679453, + "grad_norm": 0.49712780117988586, + "learning_rate": 9.576694411414983e-06, + "loss": 0.624, + "step": 184 + }, + { + "epoch": 0.440083258994945, + "grad_norm": 0.49449872970581055, + "learning_rate": 9.574316290130797e-06, + "loss": 0.5284, + "step": 185 + }, + { + "epoch": 0.4424620874219447, + "grad_norm": 0.5145215392112732, + "learning_rate": 9.571938168846612e-06, + "loss": 0.601, + "step": 186 + }, + { + "epoch": 0.4448409158489444, + "grad_norm": 0.5811765789985657, + "learning_rate": 9.569560047562426e-06, + "loss": 0.6505, + "step": 187 + }, + { + "epoch": 0.4472197442759441, + "grad_norm": 0.49338001012802124, + "learning_rate": 9.567181926278241e-06, + "loss": 0.5801, + "step": 188 + }, + { + "epoch": 0.4495985727029438, + "grad_norm": 0.5011583566665649, + "learning_rate": 9.564803804994055e-06, + "loss": 0.7297, + "step": 189 + }, + { + "epoch": 0.4519774011299435, + "grad_norm": 0.48783889412879944, + "learning_rate": 9.562425683709869e-06, + "loss": 0.5238, + "step": 190 + }, + { + "epoch": 0.4543562295569432, + "grad_norm": 0.485562801361084, + "learning_rate": 9.560047562425684e-06, + "loss": 0.6039, + "step": 191 + }, + { + "epoch": 0.4567350579839429, + "grad_norm": 0.5441937446594238, + "learning_rate": 9.5576694411415e-06, + "loss": 0.5661, + "step": 192 + }, + { + "epoch": 0.4591138864109426, + "grad_norm": 0.511450469493866, + "learning_rate": 9.555291319857314e-06, + "loss": 0.6305, + "step": 193 + }, + { + "epoch": 0.4614927148379423, + "grad_norm": 0.4986901879310608, + "learning_rate": 9.552913198573129e-06, + "loss": 0.6104, + "step": 194 + }, + { + "epoch": 0.463871543264942, + "grad_norm": 0.5372093915939331, + "learning_rate": 9.550535077288943e-06, + "loss": 0.5552, + "step": 195 + }, + { + "epoch": 0.4662503716919417, + "grad_norm": 0.4844958484172821, + "learning_rate": 9.548156956004757e-06, + "loss": 0.6061, + "step": 196 + }, + { + "epoch": 0.46862920011894144, + "grad_norm": 0.42187559604644775, + "learning_rate": 9.545778834720572e-06, + "loss": 0.5339, + "step": 197 + }, + { + "epoch": 0.47100802854594115, + "grad_norm": 0.4600416421890259, + "learning_rate": 9.543400713436386e-06, + "loss": 0.5396, + "step": 198 + }, + { + "epoch": 0.4733868569729408, + "grad_norm": 0.5156920552253723, + "learning_rate": 9.541022592152201e-06, + "loss": 0.642, + "step": 199 + }, + { + "epoch": 0.4757656853999405, + "grad_norm": 0.5513865351676941, + "learning_rate": 9.538644470868015e-06, + "loss": 0.5465, + "step": 200 + }, + { + "epoch": 0.4757656853999405, + "eval_loss": 0.6055951714515686, + "eval_runtime": 23.5925, + "eval_samples_per_second": 31.705, + "eval_steps_per_second": 15.852, + "step": 200 + }, + { + "epoch": 0.47814451382694023, + "grad_norm": 0.5349980592727661, + "learning_rate": 9.536266349583829e-06, + "loss": 0.6342, + "step": 201 + }, + { + "epoch": 0.48052334225393994, + "grad_norm": 0.5738868713378906, + "learning_rate": 9.533888228299644e-06, + "loss": 0.5424, + "step": 202 + }, + { + "epoch": 0.48290217068093966, + "grad_norm": 0.5268381237983704, + "learning_rate": 9.531510107015458e-06, + "loss": 0.579, + "step": 203 + }, + { + "epoch": 0.4852809991079393, + "grad_norm": 0.5708011984825134, + "learning_rate": 9.529131985731273e-06, + "loss": 0.6002, + "step": 204 + }, + { + "epoch": 0.487659827534939, + "grad_norm": 0.46687522530555725, + "learning_rate": 9.526753864447087e-06, + "loss": 0.5872, + "step": 205 + }, + { + "epoch": 0.49003865596193874, + "grad_norm": 0.560222864151001, + "learning_rate": 9.524375743162903e-06, + "loss": 0.669, + "step": 206 + }, + { + "epoch": 0.49241748438893845, + "grad_norm": 0.5192834734916687, + "learning_rate": 9.521997621878716e-06, + "loss": 0.6161, + "step": 207 + }, + { + "epoch": 0.49479631281593817, + "grad_norm": 0.5305255651473999, + "learning_rate": 9.519619500594532e-06, + "loss": 0.5761, + "step": 208 + }, + { + "epoch": 0.4971751412429379, + "grad_norm": 0.47425130009651184, + "learning_rate": 9.517241379310346e-06, + "loss": 0.5623, + "step": 209 + }, + { + "epoch": 0.49955396966993754, + "grad_norm": 0.5019915699958801, + "learning_rate": 9.51486325802616e-06, + "loss": 0.5027, + "step": 210 + }, + { + "epoch": 0.5019327980969372, + "grad_norm": 0.5448631048202515, + "learning_rate": 9.512485136741975e-06, + "loss": 0.6247, + "step": 211 + }, + { + "epoch": 0.5043116265239369, + "grad_norm": 0.5231260657310486, + "learning_rate": 9.510107015457789e-06, + "loss": 0.5067, + "step": 212 + }, + { + "epoch": 0.5066904549509367, + "grad_norm": 0.5327592492103577, + "learning_rate": 9.507728894173604e-06, + "loss": 0.5747, + "step": 213 + }, + { + "epoch": 0.5090692833779363, + "grad_norm": 0.5464369654655457, + "learning_rate": 9.505350772889418e-06, + "loss": 0.5892, + "step": 214 + }, + { + "epoch": 0.5114481118049361, + "grad_norm": 0.5470043420791626, + "learning_rate": 9.502972651605232e-06, + "loss": 0.6653, + "step": 215 + }, + { + "epoch": 0.5138269402319358, + "grad_norm": 0.5360198616981506, + "learning_rate": 9.500594530321047e-06, + "loss": 0.5779, + "step": 216 + }, + { + "epoch": 0.5162057686589355, + "grad_norm": 0.461517870426178, + "learning_rate": 9.49821640903686e-06, + "loss": 0.4899, + "step": 217 + }, + { + "epoch": 0.5185845970859352, + "grad_norm": 0.6309890747070312, + "learning_rate": 9.495838287752676e-06, + "loss": 0.7225, + "step": 218 + }, + { + "epoch": 0.5209634255129348, + "grad_norm": 0.6221989393234253, + "learning_rate": 9.493460166468492e-06, + "loss": 0.6706, + "step": 219 + }, + { + "epoch": 0.5233422539399346, + "grad_norm": 0.48245179653167725, + "learning_rate": 9.491082045184306e-06, + "loss": 0.4771, + "step": 220 + }, + { + "epoch": 0.5257210823669343, + "grad_norm": 0.5116746425628662, + "learning_rate": 9.48870392390012e-06, + "loss": 0.5826, + "step": 221 + }, + { + "epoch": 0.528099910793934, + "grad_norm": 0.5033395290374756, + "learning_rate": 9.486325802615935e-06, + "loss": 0.5587, + "step": 222 + }, + { + "epoch": 0.5304787392209337, + "grad_norm": 0.55325847864151, + "learning_rate": 9.483947681331749e-06, + "loss": 0.5529, + "step": 223 + }, + { + "epoch": 0.5328575676479334, + "grad_norm": 0.5221287608146667, + "learning_rate": 9.481569560047564e-06, + "loss": 0.529, + "step": 224 + }, + { + "epoch": 0.5352363960749331, + "grad_norm": 0.5181965231895447, + "learning_rate": 9.479191438763378e-06, + "loss": 0.6871, + "step": 225 + }, + { + "epoch": 0.5376152245019328, + "grad_norm": 0.5322456955909729, + "learning_rate": 9.476813317479191e-06, + "loss": 0.6405, + "step": 226 + }, + { + "epoch": 0.5399940529289325, + "grad_norm": 0.5056934356689453, + "learning_rate": 9.474435196195007e-06, + "loss": 0.553, + "step": 227 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 0.602809488773346, + "learning_rate": 9.47205707491082e-06, + "loss": 0.574, + "step": 228 + }, + { + "epoch": 0.5447517097829319, + "grad_norm": 0.5264748334884644, + "learning_rate": 9.469678953626636e-06, + "loss": 0.6317, + "step": 229 + }, + { + "epoch": 0.5471305382099316, + "grad_norm": 0.555245041847229, + "learning_rate": 9.46730083234245e-06, + "loss": 0.5501, + "step": 230 + }, + { + "epoch": 0.5495093666369313, + "grad_norm": 0.5024836659431458, + "learning_rate": 9.464922711058264e-06, + "loss": 0.5364, + "step": 231 + }, + { + "epoch": 0.551888195063931, + "grad_norm": 0.5490068197250366, + "learning_rate": 9.46254458977408e-06, + "loss": 0.528, + "step": 232 + }, + { + "epoch": 0.5542670234909307, + "grad_norm": 0.5184789896011353, + "learning_rate": 9.460166468489895e-06, + "loss": 0.5785, + "step": 233 + }, + { + "epoch": 0.5566458519179304, + "grad_norm": 0.5771692395210266, + "learning_rate": 9.457788347205708e-06, + "loss": 0.589, + "step": 234 + }, + { + "epoch": 0.5590246803449301, + "grad_norm": 0.4899429380893707, + "learning_rate": 9.455410225921524e-06, + "loss": 0.5568, + "step": 235 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.5054925680160522, + "learning_rate": 9.453032104637338e-06, + "loss": 0.5931, + "step": 236 + }, + { + "epoch": 0.5637823371989296, + "grad_norm": 0.574116051197052, + "learning_rate": 9.450653983353151e-06, + "loss": 0.6157, + "step": 237 + }, + { + "epoch": 0.5661611656259292, + "grad_norm": 0.487914115190506, + "learning_rate": 9.448275862068967e-06, + "loss": 0.5302, + "step": 238 + }, + { + "epoch": 0.568539994052929, + "grad_norm": 0.4909398853778839, + "learning_rate": 9.44589774078478e-06, + "loss": 0.5296, + "step": 239 + }, + { + "epoch": 0.5709188224799286, + "grad_norm": 0.5328031182289124, + "learning_rate": 9.443519619500594e-06, + "loss": 0.5561, + "step": 240 + }, + { + "epoch": 0.5732976509069283, + "grad_norm": 0.4987448751926422, + "learning_rate": 9.44114149821641e-06, + "loss": 0.6209, + "step": 241 + }, + { + "epoch": 0.5756764793339281, + "grad_norm": 0.5285313725471497, + "learning_rate": 9.438763376932224e-06, + "loss": 0.4916, + "step": 242 + }, + { + "epoch": 0.5780553077609277, + "grad_norm": 0.5365431904792786, + "learning_rate": 9.436385255648039e-06, + "loss": 0.5025, + "step": 243 + }, + { + "epoch": 0.5804341361879275, + "grad_norm": 0.5295987725257874, + "learning_rate": 9.434007134363853e-06, + "loss": 0.5796, + "step": 244 + }, + { + "epoch": 0.5828129646149272, + "grad_norm": 0.4700901508331299, + "learning_rate": 9.431629013079668e-06, + "loss": 0.5844, + "step": 245 + }, + { + "epoch": 0.5851917930419268, + "grad_norm": 0.5074923634529114, + "learning_rate": 9.429250891795482e-06, + "loss": 0.6082, + "step": 246 + }, + { + "epoch": 0.5875706214689266, + "grad_norm": 0.5145196914672852, + "learning_rate": 9.426872770511297e-06, + "loss": 0.5482, + "step": 247 + }, + { + "epoch": 0.5899494498959262, + "grad_norm": 0.5206637382507324, + "learning_rate": 9.424494649227111e-06, + "loss": 0.4899, + "step": 248 + }, + { + "epoch": 0.592328278322926, + "grad_norm": 0.46147269010543823, + "learning_rate": 9.422116527942927e-06, + "loss": 0.5471, + "step": 249 + }, + { + "epoch": 0.5947071067499257, + "grad_norm": 0.5611085891723633, + "learning_rate": 9.41973840665874e-06, + "loss": 0.5544, + "step": 250 + }, + { + "epoch": 0.5947071067499257, + "eval_loss": 0.5782613754272461, + "eval_runtime": 23.1327, + "eval_samples_per_second": 32.335, + "eval_steps_per_second": 16.168, + "step": 250 + }, + { + "epoch": 0.5970859351769253, + "grad_norm": 0.4951978027820587, + "learning_rate": 9.417360285374554e-06, + "loss": 0.5867, + "step": 251 + }, + { + "epoch": 0.5994647636039251, + "grad_norm": 0.6002156734466553, + "learning_rate": 9.41498216409037e-06, + "loss": 0.664, + "step": 252 + }, + { + "epoch": 0.6018435920309247, + "grad_norm": 0.5500977039337158, + "learning_rate": 9.412604042806183e-06, + "loss": 0.5535, + "step": 253 + }, + { + "epoch": 0.6042224204579245, + "grad_norm": 0.5385454297065735, + "learning_rate": 9.410225921521999e-06, + "loss": 0.5751, + "step": 254 + }, + { + "epoch": 0.6066012488849242, + "grad_norm": 0.6178719401359558, + "learning_rate": 9.407847800237813e-06, + "loss": 0.5505, + "step": 255 + }, + { + "epoch": 0.6089800773119238, + "grad_norm": 0.5742382407188416, + "learning_rate": 9.405469678953626e-06, + "loss": 0.5763, + "step": 256 + }, + { + "epoch": 0.6113589057389236, + "grad_norm": 0.5045819282531738, + "learning_rate": 9.403091557669442e-06, + "loss": 0.5307, + "step": 257 + }, + { + "epoch": 0.6137377341659233, + "grad_norm": 0.5397653579711914, + "learning_rate": 9.400713436385256e-06, + "loss": 0.5871, + "step": 258 + }, + { + "epoch": 0.616116562592923, + "grad_norm": 0.5844741463661194, + "learning_rate": 9.398335315101071e-06, + "loss": 0.5684, + "step": 259 + }, + { + "epoch": 0.6184953910199227, + "grad_norm": 0.4845225512981415, + "learning_rate": 9.395957193816887e-06, + "loss": 0.5341, + "step": 260 + }, + { + "epoch": 0.6208742194469223, + "grad_norm": 0.5233320593833923, + "learning_rate": 9.3935790725327e-06, + "loss": 0.5623, + "step": 261 + }, + { + "epoch": 0.6232530478739221, + "grad_norm": 0.5731892585754395, + "learning_rate": 9.391200951248514e-06, + "loss": 0.5683, + "step": 262 + }, + { + "epoch": 0.6256318763009218, + "grad_norm": 0.539307713508606, + "learning_rate": 9.38882282996433e-06, + "loss": 0.5541, + "step": 263 + }, + { + "epoch": 0.6280107047279215, + "grad_norm": 0.4763849377632141, + "learning_rate": 9.386444708680143e-06, + "loss": 0.553, + "step": 264 + }, + { + "epoch": 0.6303895331549212, + "grad_norm": 0.5950015783309937, + "learning_rate": 9.384066587395959e-06, + "loss": 0.5716, + "step": 265 + }, + { + "epoch": 0.632768361581921, + "grad_norm": 0.6006914377212524, + "learning_rate": 9.381688466111773e-06, + "loss": 0.6107, + "step": 266 + }, + { + "epoch": 0.6351471900089206, + "grad_norm": 0.5577612519264221, + "learning_rate": 9.379310344827586e-06, + "loss": 0.538, + "step": 267 + }, + { + "epoch": 0.6375260184359203, + "grad_norm": 0.5474477410316467, + "learning_rate": 9.376932223543402e-06, + "loss": 0.5852, + "step": 268 + }, + { + "epoch": 0.63990484686292, + "grad_norm": 0.5812591314315796, + "learning_rate": 9.374554102259216e-06, + "loss": 0.626, + "step": 269 + }, + { + "epoch": 0.6422836752899197, + "grad_norm": 0.5159488916397095, + "learning_rate": 9.37217598097503e-06, + "loss": 0.575, + "step": 270 + }, + { + "epoch": 0.6446625037169195, + "grad_norm": 0.5275406241416931, + "learning_rate": 9.369797859690845e-06, + "loss": 0.5463, + "step": 271 + }, + { + "epoch": 0.6470413321439191, + "grad_norm": 0.5831428170204163, + "learning_rate": 9.367419738406659e-06, + "loss": 0.6201, + "step": 272 + }, + { + "epoch": 0.6494201605709188, + "grad_norm": 0.4947662055492401, + "learning_rate": 9.365041617122474e-06, + "loss": 0.5697, + "step": 273 + }, + { + "epoch": 0.6517989889979185, + "grad_norm": 0.4913242757320404, + "learning_rate": 9.36266349583829e-06, + "loss": 0.5887, + "step": 274 + }, + { + "epoch": 0.6541778174249182, + "grad_norm": 0.5345326066017151, + "learning_rate": 9.360285374554103e-06, + "loss": 0.5077, + "step": 275 + }, + { + "epoch": 0.656556645851918, + "grad_norm": 0.4429227411746979, + "learning_rate": 9.357907253269917e-06, + "loss": 0.4719, + "step": 276 + }, + { + "epoch": 0.6589354742789176, + "grad_norm": 0.4688141644001007, + "learning_rate": 9.355529131985732e-06, + "loss": 0.5008, + "step": 277 + }, + { + "epoch": 0.6613143027059173, + "grad_norm": 0.4880606234073639, + "learning_rate": 9.353151010701546e-06, + "loss": 0.5261, + "step": 278 + }, + { + "epoch": 0.6636931311329171, + "grad_norm": 0.5775059461593628, + "learning_rate": 9.350772889417362e-06, + "loss": 0.6112, + "step": 279 + }, + { + "epoch": 0.6660719595599167, + "grad_norm": 0.484942764043808, + "learning_rate": 9.348394768133175e-06, + "loss": 0.4746, + "step": 280 + }, + { + "epoch": 0.6684507879869165, + "grad_norm": 0.56173175573349, + "learning_rate": 9.34601664684899e-06, + "loss": 0.5809, + "step": 281 + }, + { + "epoch": 0.6708296164139161, + "grad_norm": 0.5112358927726746, + "learning_rate": 9.343638525564805e-06, + "loss": 0.5823, + "step": 282 + }, + { + "epoch": 0.6732084448409158, + "grad_norm": 0.5745301842689514, + "learning_rate": 9.341260404280618e-06, + "loss": 0.5569, + "step": 283 + }, + { + "epoch": 0.6755872732679156, + "grad_norm": 0.5402821898460388, + "learning_rate": 9.338882282996434e-06, + "loss": 0.5094, + "step": 284 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 0.5265027284622192, + "learning_rate": 9.336504161712248e-06, + "loss": 0.6429, + "step": 285 + }, + { + "epoch": 0.680344930121915, + "grad_norm": 0.5121905207633972, + "learning_rate": 9.334126040428063e-06, + "loss": 0.5191, + "step": 286 + }, + { + "epoch": 0.6827237585489146, + "grad_norm": 0.5409682393074036, + "learning_rate": 9.331747919143877e-06, + "loss": 0.535, + "step": 287 + }, + { + "epoch": 0.6851025869759143, + "grad_norm": 0.5377373099327087, + "learning_rate": 9.329369797859692e-06, + "loss": 0.6208, + "step": 288 + }, + { + "epoch": 0.6874814154029141, + "grad_norm": 0.5877012014389038, + "learning_rate": 9.326991676575506e-06, + "loss": 0.6442, + "step": 289 + }, + { + "epoch": 0.6898602438299137, + "grad_norm": 0.4919497072696686, + "learning_rate": 9.324613555291322e-06, + "loss": 0.4625, + "step": 290 + }, + { + "epoch": 0.6922390722569135, + "grad_norm": 0.5160561800003052, + "learning_rate": 9.322235434007135e-06, + "loss": 0.4601, + "step": 291 + }, + { + "epoch": 0.6946179006839132, + "grad_norm": 0.4627416431903839, + "learning_rate": 9.319857312722949e-06, + "loss": 0.5667, + "step": 292 + }, + { + "epoch": 0.6969967291109129, + "grad_norm": 0.6004971861839294, + "learning_rate": 9.317479191438764e-06, + "loss": 0.5834, + "step": 293 + }, + { + "epoch": 0.6993755575379126, + "grad_norm": 0.5150706171989441, + "learning_rate": 9.315101070154578e-06, + "loss": 0.5039, + "step": 294 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.6226675510406494, + "learning_rate": 9.312722948870394e-06, + "loss": 0.6403, + "step": 295 + }, + { + "epoch": 0.704133214391912, + "grad_norm": 0.6270530819892883, + "learning_rate": 9.310344827586207e-06, + "loss": 0.5698, + "step": 296 + }, + { + "epoch": 0.7065120428189117, + "grad_norm": 0.5762889385223389, + "learning_rate": 9.307966706302021e-06, + "loss": 0.6574, + "step": 297 + }, + { + "epoch": 0.7088908712459114, + "grad_norm": 0.5319334268569946, + "learning_rate": 9.305588585017837e-06, + "loss": 0.5939, + "step": 298 + }, + { + "epoch": 0.7112696996729111, + "grad_norm": 0.5508785247802734, + "learning_rate": 9.30321046373365e-06, + "loss": 0.5655, + "step": 299 + }, + { + "epoch": 0.7136485280999108, + "grad_norm": 0.5131646394729614, + "learning_rate": 9.300832342449466e-06, + "loss": 0.5123, + "step": 300 + }, + { + "epoch": 0.7136485280999108, + "eval_loss": 0.5579515099525452, + "eval_runtime": 23.1839, + "eval_samples_per_second": 32.264, + "eval_steps_per_second": 16.132, + "step": 300 + }, + { + "epoch": 0.7160273565269105, + "grad_norm": 0.5203574299812317, + "learning_rate": 9.298454221165281e-06, + "loss": 0.545, + "step": 301 + }, + { + "epoch": 0.7184061849539102, + "grad_norm": 0.4805406332015991, + "learning_rate": 9.296076099881095e-06, + "loss": 0.4771, + "step": 302 + }, + { + "epoch": 0.72078501338091, + "grad_norm": 0.576305627822876, + "learning_rate": 9.293697978596909e-06, + "loss": 0.5438, + "step": 303 + }, + { + "epoch": 0.7231638418079096, + "grad_norm": 0.5781462788581848, + "learning_rate": 9.291319857312724e-06, + "loss": 0.5565, + "step": 304 + }, + { + "epoch": 0.7255426702349093, + "grad_norm": 0.6189631819725037, + "learning_rate": 9.288941736028538e-06, + "loss": 0.611, + "step": 305 + }, + { + "epoch": 0.727921498661909, + "grad_norm": 0.5450212955474854, + "learning_rate": 9.286563614744354e-06, + "loss": 0.4782, + "step": 306 + }, + { + "epoch": 0.7303003270889087, + "grad_norm": 0.5933527946472168, + "learning_rate": 9.284185493460167e-06, + "loss": 0.6207, + "step": 307 + }, + { + "epoch": 0.7326791555159085, + "grad_norm": 0.4986989200115204, + "learning_rate": 9.281807372175981e-06, + "loss": 0.5558, + "step": 308 + }, + { + "epoch": 0.7350579839429081, + "grad_norm": 0.48861125111579895, + "learning_rate": 9.279429250891797e-06, + "loss": 0.5039, + "step": 309 + }, + { + "epoch": 0.7374368123699078, + "grad_norm": 0.5433496832847595, + "learning_rate": 9.27705112960761e-06, + "loss": 0.5078, + "step": 310 + }, + { + "epoch": 0.7398156407969075, + "grad_norm": 0.5490679740905762, + "learning_rate": 9.274673008323424e-06, + "loss": 0.4537, + "step": 311 + }, + { + "epoch": 0.7421944692239072, + "grad_norm": 0.6522793769836426, + "learning_rate": 9.27229488703924e-06, + "loss": 0.5973, + "step": 312 + }, + { + "epoch": 0.744573297650907, + "grad_norm": 0.5692256093025208, + "learning_rate": 9.269916765755053e-06, + "loss": 0.5531, + "step": 313 + }, + { + "epoch": 0.7469521260779066, + "grad_norm": 0.5248643755912781, + "learning_rate": 9.267538644470869e-06, + "loss": 0.5063, + "step": 314 + }, + { + "epoch": 0.7493309545049064, + "grad_norm": 0.5677797794342041, + "learning_rate": 9.265160523186684e-06, + "loss": 0.5535, + "step": 315 + }, + { + "epoch": 0.751709782931906, + "grad_norm": 0.5678485035896301, + "learning_rate": 9.262782401902498e-06, + "loss": 0.5675, + "step": 316 + }, + { + "epoch": 0.7540886113589057, + "grad_norm": 0.5687801837921143, + "learning_rate": 9.260404280618312e-06, + "loss": 0.5332, + "step": 317 + }, + { + "epoch": 0.7564674397859055, + "grad_norm": 0.595946192741394, + "learning_rate": 9.258026159334127e-06, + "loss": 0.5631, + "step": 318 + }, + { + "epoch": 0.7588462682129051, + "grad_norm": 0.4984569847583771, + "learning_rate": 9.255648038049941e-06, + "loss": 0.5061, + "step": 319 + }, + { + "epoch": 0.7612250966399049, + "grad_norm": 0.5868138670921326, + "learning_rate": 9.253269916765756e-06, + "loss": 0.5106, + "step": 320 + }, + { + "epoch": 0.7636039250669046, + "grad_norm": 0.6345978379249573, + "learning_rate": 9.25089179548157e-06, + "loss": 0.5357, + "step": 321 + }, + { + "epoch": 0.7659827534939042, + "grad_norm": 0.5315825343132019, + "learning_rate": 9.248513674197384e-06, + "loss": 0.4493, + "step": 322 + }, + { + "epoch": 0.768361581920904, + "grad_norm": 0.5738168954849243, + "learning_rate": 9.2461355529132e-06, + "loss": 0.6396, + "step": 323 + }, + { + "epoch": 0.7707404103479036, + "grad_norm": 0.5067557692527771, + "learning_rate": 9.243757431629013e-06, + "loss": 0.5359, + "step": 324 + }, + { + "epoch": 0.7731192387749034, + "grad_norm": 0.5229761600494385, + "learning_rate": 9.241379310344829e-06, + "loss": 0.509, + "step": 325 + }, + { + "epoch": 0.7754980672019031, + "grad_norm": 0.5950191617012024, + "learning_rate": 9.239001189060642e-06, + "loss": 0.5078, + "step": 326 + }, + { + "epoch": 0.7778768956289027, + "grad_norm": 0.5776432156562805, + "learning_rate": 9.236623067776458e-06, + "loss": 0.4654, + "step": 327 + }, + { + "epoch": 0.7802557240559025, + "grad_norm": 0.6692399382591248, + "learning_rate": 9.234244946492272e-06, + "loss": 0.5661, + "step": 328 + }, + { + "epoch": 0.7826345524829021, + "grad_norm": 0.5446496605873108, + "learning_rate": 9.231866825208087e-06, + "loss": 0.5771, + "step": 329 + }, + { + "epoch": 0.7850133809099019, + "grad_norm": 0.5848485827445984, + "learning_rate": 9.229488703923901e-06, + "loss": 0.4715, + "step": 330 + }, + { + "epoch": 0.7873922093369016, + "grad_norm": 0.5969777703285217, + "learning_rate": 9.227110582639716e-06, + "loss": 0.5864, + "step": 331 + }, + { + "epoch": 0.7897710377639012, + "grad_norm": 0.5181068778038025, + "learning_rate": 9.22473246135553e-06, + "loss": 0.5265, + "step": 332 + }, + { + "epoch": 0.792149866190901, + "grad_norm": 0.6710075736045837, + "learning_rate": 9.222354340071344e-06, + "loss": 0.6011, + "step": 333 + }, + { + "epoch": 0.7945286946179007, + "grad_norm": 0.5986305475234985, + "learning_rate": 9.21997621878716e-06, + "loss": 0.5108, + "step": 334 + }, + { + "epoch": 0.7969075230449004, + "grad_norm": 0.6084244847297668, + "learning_rate": 9.217598097502973e-06, + "loss": 0.6043, + "step": 335 + }, + { + "epoch": 0.7992863514719001, + "grad_norm": 0.5425276160240173, + "learning_rate": 9.215219976218789e-06, + "loss": 0.5145, + "step": 336 + }, + { + "epoch": 0.8016651798988997, + "grad_norm": 0.5162472724914551, + "learning_rate": 9.212841854934602e-06, + "loss": 0.5822, + "step": 337 + }, + { + "epoch": 0.8040440083258995, + "grad_norm": 0.6387590169906616, + "learning_rate": 9.210463733650416e-06, + "loss": 0.5015, + "step": 338 + }, + { + "epoch": 0.8064228367528992, + "grad_norm": 0.5944037437438965, + "learning_rate": 9.208085612366232e-06, + "loss": 0.5094, + "step": 339 + }, + { + "epoch": 0.8088016651798989, + "grad_norm": 0.6407771706581116, + "learning_rate": 9.205707491082045e-06, + "loss": 0.5578, + "step": 340 + }, + { + "epoch": 0.8111804936068986, + "grad_norm": 0.5614750385284424, + "learning_rate": 9.20332936979786e-06, + "loss": 0.4634, + "step": 341 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 0.5568011403083801, + "learning_rate": 9.200951248513676e-06, + "loss": 0.5044, + "step": 342 + }, + { + "epoch": 0.815938150460898, + "grad_norm": 0.7975798845291138, + "learning_rate": 9.19857312722949e-06, + "loss": 0.6599, + "step": 343 + }, + { + "epoch": 0.8183169788878977, + "grad_norm": 0.6632049679756165, + "learning_rate": 9.196195005945304e-06, + "loss": 0.5918, + "step": 344 + }, + { + "epoch": 0.8206958073148974, + "grad_norm": 0.608599066734314, + "learning_rate": 9.19381688466112e-06, + "loss": 0.5666, + "step": 345 + }, + { + "epoch": 0.8230746357418971, + "grad_norm": 0.6245832443237305, + "learning_rate": 9.191438763376933e-06, + "loss": 0.5934, + "step": 346 + }, + { + "epoch": 0.8254534641688969, + "grad_norm": 0.5409818291664124, + "learning_rate": 9.189060642092747e-06, + "loss": 0.482, + "step": 347 + }, + { + "epoch": 0.8278322925958965, + "grad_norm": 0.5313441753387451, + "learning_rate": 9.186682520808562e-06, + "loss": 0.5204, + "step": 348 + }, + { + "epoch": 0.8302111210228962, + "grad_norm": 0.5901579260826111, + "learning_rate": 9.184304399524376e-06, + "loss": 0.5404, + "step": 349 + }, + { + "epoch": 0.832589949449896, + "grad_norm": 0.6241204142570496, + "learning_rate": 9.181926278240191e-06, + "loss": 0.4884, + "step": 350 + }, + { + "epoch": 0.832589949449896, + "eval_loss": 0.5422665476799011, + "eval_runtime": 23.4421, + "eval_samples_per_second": 31.908, + "eval_steps_per_second": 15.954, + "step": 350 + }, + { + "epoch": 0.8349687778768956, + "grad_norm": 0.5375258326530457, + "learning_rate": 9.179548156956005e-06, + "loss": 0.4967, + "step": 351 + }, + { + "epoch": 0.8373476063038954, + "grad_norm": 0.6130648851394653, + "learning_rate": 9.177170035671819e-06, + "loss": 0.5363, + "step": 352 + }, + { + "epoch": 0.839726434730895, + "grad_norm": 0.5307011008262634, + "learning_rate": 9.174791914387634e-06, + "loss": 0.4953, + "step": 353 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.5408076047897339, + "learning_rate": 9.172413793103448e-06, + "loss": 0.479, + "step": 354 + }, + { + "epoch": 0.8444840915848945, + "grad_norm": 0.529685914516449, + "learning_rate": 9.170035671819264e-06, + "loss": 0.4891, + "step": 355 + }, + { + "epoch": 0.8468629200118941, + "grad_norm": 0.6279693841934204, + "learning_rate": 9.167657550535079e-06, + "loss": 0.5621, + "step": 356 + }, + { + "epoch": 0.8492417484388939, + "grad_norm": 0.5937814116477966, + "learning_rate": 9.165279429250893e-06, + "loss": 0.5259, + "step": 357 + }, + { + "epoch": 0.8516205768658935, + "grad_norm": 0.5240765810012817, + "learning_rate": 9.162901307966707e-06, + "loss": 0.4951, + "step": 358 + }, + { + "epoch": 0.8539994052928932, + "grad_norm": 0.6922802329063416, + "learning_rate": 9.160523186682522e-06, + "loss": 0.5568, + "step": 359 + }, + { + "epoch": 0.856378233719893, + "grad_norm": 0.49827560782432556, + "learning_rate": 9.158145065398336e-06, + "loss": 0.4906, + "step": 360 + }, + { + "epoch": 0.8587570621468926, + "grad_norm": 0.5908154845237732, + "learning_rate": 9.155766944114151e-06, + "loss": 0.589, + "step": 361 + }, + { + "epoch": 0.8611358905738924, + "grad_norm": 0.603060781955719, + "learning_rate": 9.153388822829965e-06, + "loss": 0.5772, + "step": 362 + }, + { + "epoch": 0.863514719000892, + "grad_norm": 0.5168816447257996, + "learning_rate": 9.151010701545779e-06, + "loss": 0.4313, + "step": 363 + }, + { + "epoch": 0.8658935474278918, + "grad_norm": 0.5934916138648987, + "learning_rate": 9.148632580261594e-06, + "loss": 0.5665, + "step": 364 + }, + { + "epoch": 0.8682723758548915, + "grad_norm": 0.583383321762085, + "learning_rate": 9.146254458977408e-06, + "loss": 0.5713, + "step": 365 + }, + { + "epoch": 0.8706512042818911, + "grad_norm": 0.4994150698184967, + "learning_rate": 9.143876337693223e-06, + "loss": 0.4724, + "step": 366 + }, + { + "epoch": 0.8730300327088909, + "grad_norm": 0.5267557501792908, + "learning_rate": 9.141498216409037e-06, + "loss": 0.5026, + "step": 367 + }, + { + "epoch": 0.8754088611358906, + "grad_norm": 0.5061285495758057, + "learning_rate": 9.139120095124853e-06, + "loss": 0.5575, + "step": 368 + }, + { + "epoch": 0.8777876895628903, + "grad_norm": 0.6534325480461121, + "learning_rate": 9.136741973840666e-06, + "loss": 0.5377, + "step": 369 + }, + { + "epoch": 0.88016651798989, + "grad_norm": 0.5024367570877075, + "learning_rate": 9.134363852556482e-06, + "loss": 0.452, + "step": 370 + }, + { + "epoch": 0.8825453464168896, + "grad_norm": 0.584671139717102, + "learning_rate": 9.131985731272296e-06, + "loss": 0.4928, + "step": 371 + }, + { + "epoch": 0.8849241748438894, + "grad_norm": 0.5525078177452087, + "learning_rate": 9.129607609988111e-06, + "loss": 0.4383, + "step": 372 + }, + { + "epoch": 0.8873030032708891, + "grad_norm": 0.5527409911155701, + "learning_rate": 9.127229488703925e-06, + "loss": 0.4923, + "step": 373 + }, + { + "epoch": 0.8896818316978888, + "grad_norm": 0.5926960110664368, + "learning_rate": 9.124851367419739e-06, + "loss": 0.5009, + "step": 374 + }, + { + "epoch": 0.8920606601248885, + "grad_norm": 0.6386744976043701, + "learning_rate": 9.122473246135554e-06, + "loss": 0.6297, + "step": 375 + }, + { + "epoch": 0.8944394885518882, + "grad_norm": 0.5767717957496643, + "learning_rate": 9.120095124851368e-06, + "loss": 0.4848, + "step": 376 + }, + { + "epoch": 0.8968183169788879, + "grad_norm": 0.5442641377449036, + "learning_rate": 9.117717003567182e-06, + "loss": 0.5131, + "step": 377 + }, + { + "epoch": 0.8991971454058876, + "grad_norm": 0.5158716440200806, + "learning_rate": 9.115338882282997e-06, + "loss": 0.489, + "step": 378 + }, + { + "epoch": 0.9015759738328873, + "grad_norm": 0.597083330154419, + "learning_rate": 9.112960760998811e-06, + "loss": 0.5914, + "step": 379 + }, + { + "epoch": 0.903954802259887, + "grad_norm": 0.5438734889030457, + "learning_rate": 9.110582639714626e-06, + "loss": 0.4921, + "step": 380 + }, + { + "epoch": 0.9063336306868867, + "grad_norm": 0.4988299012184143, + "learning_rate": 9.10820451843044e-06, + "loss": 0.4672, + "step": 381 + }, + { + "epoch": 0.9087124591138864, + "grad_norm": 0.6793142557144165, + "learning_rate": 9.105826397146256e-06, + "loss": 0.5493, + "step": 382 + }, + { + "epoch": 0.9110912875408861, + "grad_norm": 0.567133367061615, + "learning_rate": 9.10344827586207e-06, + "loss": 0.4777, + "step": 383 + }, + { + "epoch": 0.9134701159678859, + "grad_norm": 0.590007483959198, + "learning_rate": 9.101070154577885e-06, + "loss": 0.6682, + "step": 384 + }, + { + "epoch": 0.9158489443948855, + "grad_norm": 0.589850902557373, + "learning_rate": 9.098692033293699e-06, + "loss": 0.5487, + "step": 385 + }, + { + "epoch": 0.9182277728218852, + "grad_norm": 0.5666838884353638, + "learning_rate": 9.096313912009514e-06, + "loss": 0.5366, + "step": 386 + }, + { + "epoch": 0.9206066012488849, + "grad_norm": 0.6215356588363647, + "learning_rate": 9.093935790725328e-06, + "loss": 0.4893, + "step": 387 + }, + { + "epoch": 0.9229854296758846, + "grad_norm": 0.5517717003822327, + "learning_rate": 9.091557669441142e-06, + "loss": 0.4951, + "step": 388 + }, + { + "epoch": 0.9253642581028844, + "grad_norm": 0.6346622109413147, + "learning_rate": 9.089179548156957e-06, + "loss": 0.5525, + "step": 389 + }, + { + "epoch": 0.927743086529884, + "grad_norm": 0.6366156339645386, + "learning_rate": 9.08680142687277e-06, + "loss": 0.6088, + "step": 390 + }, + { + "epoch": 0.9301219149568838, + "grad_norm": 0.6186984777450562, + "learning_rate": 9.084423305588586e-06, + "loss": 0.5018, + "step": 391 + }, + { + "epoch": 0.9325007433838834, + "grad_norm": 0.602583110332489, + "learning_rate": 9.0820451843044e-06, + "loss": 0.5074, + "step": 392 + }, + { + "epoch": 0.9348795718108831, + "grad_norm": 0.5531905889511108, + "learning_rate": 9.079667063020214e-06, + "loss": 0.5089, + "step": 393 + }, + { + "epoch": 0.9372584002378829, + "grad_norm": 0.49984800815582275, + "learning_rate": 9.07728894173603e-06, + "loss": 0.4691, + "step": 394 + }, + { + "epoch": 0.9396372286648825, + "grad_norm": 0.5420375466346741, + "learning_rate": 9.074910820451843e-06, + "loss": 0.491, + "step": 395 + }, + { + "epoch": 0.9420160570918823, + "grad_norm": 0.5088158845901489, + "learning_rate": 9.072532699167658e-06, + "loss": 0.5175, + "step": 396 + }, + { + "epoch": 0.944394885518882, + "grad_norm": 0.5295800566673279, + "learning_rate": 9.070154577883474e-06, + "loss": 0.4608, + "step": 397 + }, + { + "epoch": 0.9467737139458816, + "grad_norm": 0.5850183963775635, + "learning_rate": 9.067776456599288e-06, + "loss": 0.4665, + "step": 398 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 0.6050384044647217, + "learning_rate": 9.065398335315101e-06, + "loss": 0.5676, + "step": 399 + }, + { + "epoch": 0.951531370799881, + "grad_norm": 0.5520824790000916, + "learning_rate": 9.063020214030917e-06, + "loss": 0.5574, + "step": 400 + }, + { + "epoch": 0.951531370799881, + "eval_loss": 0.5304277539253235, + "eval_runtime": 23.5688, + "eval_samples_per_second": 31.737, + "eval_steps_per_second": 15.868, + "step": 400 + }, + { + "epoch": 0.9539101992268808, + "grad_norm": 0.7439431548118591, + "learning_rate": 9.06064209274673e-06, + "loss": 0.5033, + "step": 401 + }, + { + "epoch": 0.9562890276538805, + "grad_norm": 0.9271299242973328, + "learning_rate": 9.058263971462546e-06, + "loss": 0.5795, + "step": 402 + }, + { + "epoch": 0.9586678560808801, + "grad_norm": 0.7191694378852844, + "learning_rate": 9.05588585017836e-06, + "loss": 0.5326, + "step": 403 + }, + { + "epoch": 0.9610466845078799, + "grad_norm": 0.6732980608940125, + "learning_rate": 9.053507728894174e-06, + "loss": 0.597, + "step": 404 + }, + { + "epoch": 0.9634255129348795, + "grad_norm": 0.6190123558044434, + "learning_rate": 9.051129607609989e-06, + "loss": 0.4671, + "step": 405 + }, + { + "epoch": 0.9658043413618793, + "grad_norm": 0.6509968638420105, + "learning_rate": 9.048751486325803e-06, + "loss": 0.5995, + "step": 406 + }, + { + "epoch": 0.968183169788879, + "grad_norm": 0.5763471722602844, + "learning_rate": 9.046373365041618e-06, + "loss": 0.4908, + "step": 407 + }, + { + "epoch": 0.9705619982158786, + "grad_norm": 0.5502136945724487, + "learning_rate": 9.043995243757432e-06, + "loss": 0.5273, + "step": 408 + }, + { + "epoch": 0.9729408266428784, + "grad_norm": 0.5641132593154907, + "learning_rate": 9.041617122473248e-06, + "loss": 0.4922, + "step": 409 + }, + { + "epoch": 0.975319655069878, + "grad_norm": 0.5395079255104065, + "learning_rate": 9.039239001189061e-06, + "loss": 0.5187, + "step": 410 + }, + { + "epoch": 0.9776984834968778, + "grad_norm": 0.691702127456665, + "learning_rate": 9.036860879904877e-06, + "loss": 0.5675, + "step": 411 + }, + { + "epoch": 0.9800773119238775, + "grad_norm": 0.6059073805809021, + "learning_rate": 9.03448275862069e-06, + "loss": 0.5325, + "step": 412 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.6118643879890442, + "learning_rate": 9.032104637336506e-06, + "loss": 0.5276, + "step": 413 + }, + { + "epoch": 0.9848349687778769, + "grad_norm": 0.5294259786605835, + "learning_rate": 9.02972651605232e-06, + "loss": 0.5135, + "step": 414 + }, + { + "epoch": 0.9872137972048766, + "grad_norm": 0.5516476035118103, + "learning_rate": 9.027348394768133e-06, + "loss": 0.5303, + "step": 415 + }, + { + "epoch": 0.9895926256318763, + "grad_norm": 0.6588349342346191, + "learning_rate": 9.024970273483949e-06, + "loss": 0.4645, + "step": 416 + }, + { + "epoch": 0.991971454058876, + "grad_norm": 0.6539220213890076, + "learning_rate": 9.022592152199763e-06, + "loss": 0.6042, + "step": 417 + }, + { + "epoch": 0.9943502824858758, + "grad_norm": 0.6062245965003967, + "learning_rate": 9.020214030915576e-06, + "loss": 0.5584, + "step": 418 + }, + { + "epoch": 0.9967291109128754, + "grad_norm": 0.6036509871482849, + "learning_rate": 9.017835909631392e-06, + "loss": 0.5103, + "step": 419 + }, + { + "epoch": 0.9991079393398751, + "grad_norm": 0.6089546084403992, + "learning_rate": 9.015457788347206e-06, + "loss": 0.5005, + "step": 420 + }, + { + "epoch": 1.0, + "grad_norm": 1.0930063724517822, + "learning_rate": 9.013079667063021e-06, + "loss": 0.6692, + "step": 421 + }, + { + "epoch": 1.0023788284269997, + "grad_norm": 0.5539116859436035, + "learning_rate": 9.010701545778835e-06, + "loss": 0.44, + "step": 422 + }, + { + "epoch": 1.0047576568539993, + "grad_norm": 0.5324768424034119, + "learning_rate": 9.00832342449465e-06, + "loss": 0.5271, + "step": 423 + }, + { + "epoch": 1.0071364852809992, + "grad_norm": 0.6338086128234863, + "learning_rate": 9.005945303210464e-06, + "loss": 0.5773, + "step": 424 + }, + { + "epoch": 1.0095153137079989, + "grad_norm": 0.5521774291992188, + "learning_rate": 9.00356718192628e-06, + "loss": 0.612, + "step": 425 + }, + { + "epoch": 1.0118941421349985, + "grad_norm": 0.6046401262283325, + "learning_rate": 9.001189060642093e-06, + "loss": 0.4908, + "step": 426 + }, + { + "epoch": 1.0142729705619982, + "grad_norm": 0.6353985071182251, + "learning_rate": 8.998810939357909e-06, + "loss": 0.507, + "step": 427 + }, + { + "epoch": 1.0166517989889978, + "grad_norm": 0.6239448189735413, + "learning_rate": 8.996432818073723e-06, + "loss": 0.5562, + "step": 428 + }, + { + "epoch": 1.0190306274159977, + "grad_norm": 0.5814323425292969, + "learning_rate": 8.994054696789536e-06, + "loss": 0.5452, + "step": 429 + }, + { + "epoch": 1.0214094558429974, + "grad_norm": 0.576495349407196, + "learning_rate": 8.991676575505352e-06, + "loss": 0.5021, + "step": 430 + }, + { + "epoch": 1.023788284269997, + "grad_norm": 0.6266391277313232, + "learning_rate": 8.989298454221166e-06, + "loss": 0.4893, + "step": 431 + }, + { + "epoch": 1.0261671126969967, + "grad_norm": 0.6679175496101379, + "learning_rate": 8.986920332936981e-06, + "loss": 0.5393, + "step": 432 + }, + { + "epoch": 1.0285459411239963, + "grad_norm": 0.6412615180015564, + "learning_rate": 8.984542211652795e-06, + "loss": 0.4283, + "step": 433 + }, + { + "epoch": 1.0309247695509962, + "grad_norm": 0.7210113406181335, + "learning_rate": 8.982164090368609e-06, + "loss": 0.5104, + "step": 434 + }, + { + "epoch": 1.0333035979779959, + "grad_norm": 0.6056954264640808, + "learning_rate": 8.979785969084424e-06, + "loss": 0.4727, + "step": 435 + }, + { + "epoch": 1.0356824264049955, + "grad_norm": 0.6095535755157471, + "learning_rate": 8.977407847800238e-06, + "loss": 0.5935, + "step": 436 + }, + { + "epoch": 1.0380612548319952, + "grad_norm": 0.6486533880233765, + "learning_rate": 8.975029726516053e-06, + "loss": 0.4872, + "step": 437 + }, + { + "epoch": 1.0404400832589948, + "grad_norm": 0.687177836894989, + "learning_rate": 8.972651605231869e-06, + "loss": 0.6011, + "step": 438 + }, + { + "epoch": 1.0428189116859947, + "grad_norm": 0.5973390340805054, + "learning_rate": 8.970273483947682e-06, + "loss": 0.5202, + "step": 439 + }, + { + "epoch": 1.0451977401129944, + "grad_norm": 0.5381523966789246, + "learning_rate": 8.967895362663496e-06, + "loss": 0.409, + "step": 440 + }, + { + "epoch": 1.047576568539994, + "grad_norm": 0.6410127282142639, + "learning_rate": 8.965517241379312e-06, + "loss": 0.5723, + "step": 441 + }, + { + "epoch": 1.0499553969669937, + "grad_norm": 0.563360333442688, + "learning_rate": 8.963139120095125e-06, + "loss": 0.4656, + "step": 442 + }, + { + "epoch": 1.0523342253939933, + "grad_norm": 0.5769375562667847, + "learning_rate": 8.960760998810941e-06, + "loss": 0.4554, + "step": 443 + }, + { + "epoch": 1.0547130538209932, + "grad_norm": 0.6085939407348633, + "learning_rate": 8.958382877526755e-06, + "loss": 0.4151, + "step": 444 + }, + { + "epoch": 1.0570918822479929, + "grad_norm": 0.5880855321884155, + "learning_rate": 8.956004756242568e-06, + "loss": 0.4889, + "step": 445 + }, + { + "epoch": 1.0594707106749925, + "grad_norm": 0.6923370957374573, + "learning_rate": 8.953626634958384e-06, + "loss": 0.5396, + "step": 446 + }, + { + "epoch": 1.0618495391019922, + "grad_norm": 0.5577448010444641, + "learning_rate": 8.951248513674198e-06, + "loss": 0.3858, + "step": 447 + }, + { + "epoch": 1.064228367528992, + "grad_norm": 0.6180370450019836, + "learning_rate": 8.948870392390011e-06, + "loss": 0.4722, + "step": 448 + }, + { + "epoch": 1.0666071959559917, + "grad_norm": 0.65920090675354, + "learning_rate": 8.946492271105827e-06, + "loss": 0.4969, + "step": 449 + }, + { + "epoch": 1.0689860243829914, + "grad_norm": 0.6248946785926819, + "learning_rate": 8.944114149821642e-06, + "loss": 0.5321, + "step": 450 + }, + { + "epoch": 1.0689860243829914, + "eval_loss": 0.5191164612770081, + "eval_runtime": 23.7896, + "eval_samples_per_second": 31.442, + "eval_steps_per_second": 15.721, + "step": 450 + }, + { + "epoch": 1.071364852809991, + "grad_norm": 0.5661522746086121, + "learning_rate": 8.941736028537456e-06, + "loss": 0.483, + "step": 451 + }, + { + "epoch": 1.0737436812369907, + "grad_norm": 0.6542887687683105, + "learning_rate": 8.939357907253272e-06, + "loss": 0.5263, + "step": 452 + }, + { + "epoch": 1.0761225096639904, + "grad_norm": 0.5866942405700684, + "learning_rate": 8.936979785969085e-06, + "loss": 0.5232, + "step": 453 + }, + { + "epoch": 1.0785013380909902, + "grad_norm": 0.7106343507766724, + "learning_rate": 8.934601664684899e-06, + "loss": 0.5469, + "step": 454 + }, + { + "epoch": 1.08088016651799, + "grad_norm": 0.5322448015213013, + "learning_rate": 8.932223543400715e-06, + "loss": 0.373, + "step": 455 + }, + { + "epoch": 1.0832589949449896, + "grad_norm": 0.6187466382980347, + "learning_rate": 8.929845422116528e-06, + "loss": 0.4849, + "step": 456 + }, + { + "epoch": 1.0856378233719892, + "grad_norm": 0.5687085390090942, + "learning_rate": 8.927467300832344e-06, + "loss": 0.5175, + "step": 457 + }, + { + "epoch": 1.088016651798989, + "grad_norm": 0.652417778968811, + "learning_rate": 8.925089179548158e-06, + "loss": 0.5215, + "step": 458 + }, + { + "epoch": 1.0903954802259888, + "grad_norm": 0.6115758419036865, + "learning_rate": 8.922711058263971e-06, + "loss": 0.4656, + "step": 459 + }, + { + "epoch": 1.0927743086529884, + "grad_norm": 0.7389148473739624, + "learning_rate": 8.920332936979787e-06, + "loss": 0.5891, + "step": 460 + }, + { + "epoch": 1.095153137079988, + "grad_norm": 0.619549036026001, + "learning_rate": 8.9179548156956e-06, + "loss": 0.4224, + "step": 461 + }, + { + "epoch": 1.0975319655069877, + "grad_norm": 0.6067652106285095, + "learning_rate": 8.915576694411416e-06, + "loss": 0.5265, + "step": 462 + }, + { + "epoch": 1.0999107939339876, + "grad_norm": 0.5228604078292847, + "learning_rate": 8.91319857312723e-06, + "loss": 0.4531, + "step": 463 + }, + { + "epoch": 1.1022896223609873, + "grad_norm": 0.6418589353561401, + "learning_rate": 8.910820451843045e-06, + "loss": 0.5694, + "step": 464 + }, + { + "epoch": 1.104668450787987, + "grad_norm": 0.6826334595680237, + "learning_rate": 8.908442330558859e-06, + "loss": 0.4704, + "step": 465 + }, + { + "epoch": 1.1070472792149866, + "grad_norm": 0.6287785768508911, + "learning_rate": 8.906064209274674e-06, + "loss": 0.5137, + "step": 466 + }, + { + "epoch": 1.1094261076419862, + "grad_norm": 0.6398607492446899, + "learning_rate": 8.903686087990488e-06, + "loss": 0.6614, + "step": 467 + }, + { + "epoch": 1.1118049360689861, + "grad_norm": 0.6058328747749329, + "learning_rate": 8.901307966706304e-06, + "loss": 0.4817, + "step": 468 + }, + { + "epoch": 1.1141837644959858, + "grad_norm": 0.571291983127594, + "learning_rate": 8.898929845422117e-06, + "loss": 0.5014, + "step": 469 + }, + { + "epoch": 1.1165625929229854, + "grad_norm": 0.6095448136329651, + "learning_rate": 8.896551724137931e-06, + "loss": 0.486, + "step": 470 + }, + { + "epoch": 1.118941421349985, + "grad_norm": 0.6294752359390259, + "learning_rate": 8.894173602853747e-06, + "loss": 0.479, + "step": 471 + }, + { + "epoch": 1.1213202497769847, + "grad_norm": 0.603005588054657, + "learning_rate": 8.89179548156956e-06, + "loss": 0.494, + "step": 472 + }, + { + "epoch": 1.1236990782039846, + "grad_norm": 0.7180692553520203, + "learning_rate": 8.889417360285376e-06, + "loss": 0.5248, + "step": 473 + }, + { + "epoch": 1.1260779066309843, + "grad_norm": 0.6523922681808472, + "learning_rate": 8.88703923900119e-06, + "loss": 0.5473, + "step": 474 + }, + { + "epoch": 1.128456735057984, + "grad_norm": 0.6043949127197266, + "learning_rate": 8.884661117717003e-06, + "loss": 0.493, + "step": 475 + }, + { + "epoch": 1.1308355634849836, + "grad_norm": 0.6946741342544556, + "learning_rate": 8.882282996432819e-06, + "loss": 0.5729, + "step": 476 + }, + { + "epoch": 1.1332143919119833, + "grad_norm": 0.5997697710990906, + "learning_rate": 8.879904875148633e-06, + "loss": 0.4448, + "step": 477 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 0.6388835310935974, + "learning_rate": 8.877526753864448e-06, + "loss": 0.4482, + "step": 478 + }, + { + "epoch": 1.1379720487659828, + "grad_norm": 0.6901105642318726, + "learning_rate": 8.875148632580263e-06, + "loss": 0.5373, + "step": 479 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 0.625331461429596, + "learning_rate": 8.872770511296077e-06, + "loss": 0.4504, + "step": 480 + }, + { + "epoch": 1.142729705619982, + "grad_norm": 0.7057226896286011, + "learning_rate": 8.870392390011891e-06, + "loss": 0.5181, + "step": 481 + }, + { + "epoch": 1.1451085340469818, + "grad_norm": 0.646955132484436, + "learning_rate": 8.868014268727706e-06, + "loss": 0.5087, + "step": 482 + }, + { + "epoch": 1.1474873624739816, + "grad_norm": 0.6027918457984924, + "learning_rate": 8.86563614744352e-06, + "loss": 0.4777, + "step": 483 + }, + { + "epoch": 1.1498661909009813, + "grad_norm": 0.661638617515564, + "learning_rate": 8.863258026159334e-06, + "loss": 0.5415, + "step": 484 + }, + { + "epoch": 1.152245019327981, + "grad_norm": 0.6393787860870361, + "learning_rate": 8.86087990487515e-06, + "loss": 0.484, + "step": 485 + }, + { + "epoch": 1.1546238477549806, + "grad_norm": 0.5878676772117615, + "learning_rate": 8.858501783590963e-06, + "loss": 0.4517, + "step": 486 + }, + { + "epoch": 1.1570026761819805, + "grad_norm": 0.6484283208847046, + "learning_rate": 8.856123662306779e-06, + "loss": 0.5513, + "step": 487 + }, + { + "epoch": 1.1593815046089802, + "grad_norm": 0.6063655018806458, + "learning_rate": 8.853745541022592e-06, + "loss": 0.4961, + "step": 488 + }, + { + "epoch": 1.1617603330359798, + "grad_norm": 0.6313378810882568, + "learning_rate": 8.851367419738406e-06, + "loss": 0.427, + "step": 489 + }, + { + "epoch": 1.1641391614629795, + "grad_norm": 0.6609371900558472, + "learning_rate": 8.848989298454222e-06, + "loss": 0.4389, + "step": 490 + }, + { + "epoch": 1.1665179898899791, + "grad_norm": 0.6960870623588562, + "learning_rate": 8.846611177170037e-06, + "loss": 0.5235, + "step": 491 + }, + { + "epoch": 1.1688968183169788, + "grad_norm": 0.5642465949058533, + "learning_rate": 8.844233055885851e-06, + "loss": 0.449, + "step": 492 + }, + { + "epoch": 1.1712756467439787, + "grad_norm": 0.5914607048034668, + "learning_rate": 8.841854934601666e-06, + "loss": 0.4702, + "step": 493 + }, + { + "epoch": 1.1736544751709783, + "grad_norm": 0.6541077494621277, + "learning_rate": 8.83947681331748e-06, + "loss": 0.5439, + "step": 494 + }, + { + "epoch": 1.176033303597978, + "grad_norm": 0.6423191428184509, + "learning_rate": 8.837098692033294e-06, + "loss": 0.5265, + "step": 495 + }, + { + "epoch": 1.1784121320249776, + "grad_norm": 0.5965075492858887, + "learning_rate": 8.83472057074911e-06, + "loss": 0.5128, + "step": 496 + }, + { + "epoch": 1.1807909604519775, + "grad_norm": 0.6090273857116699, + "learning_rate": 8.832342449464923e-06, + "loss": 0.4356, + "step": 497 + }, + { + "epoch": 1.1831697888789772, + "grad_norm": 0.622359037399292, + "learning_rate": 8.829964328180739e-06, + "loss": 0.5247, + "step": 498 + }, + { + "epoch": 1.1855486173059768, + "grad_norm": 0.6459600925445557, + "learning_rate": 8.827586206896552e-06, + "loss": 0.428, + "step": 499 + }, + { + "epoch": 1.1879274457329765, + "grad_norm": 0.6156396865844727, + "learning_rate": 8.825208085612366e-06, + "loss": 0.5075, + "step": 500 + }, + { + "epoch": 1.1879274457329765, + "eval_loss": 0.5090909004211426, + "eval_runtime": 23.2238, + "eval_samples_per_second": 32.208, + "eval_steps_per_second": 16.104, + "step": 500 + }, + { + "epoch": 1.1903062741599761, + "grad_norm": 0.5583279132843018, + "learning_rate": 8.822829964328182e-06, + "loss": 0.4755, + "step": 501 + }, + { + "epoch": 1.1926851025869758, + "grad_norm": 0.7325577139854431, + "learning_rate": 8.820451843043995e-06, + "loss": 0.5685, + "step": 502 + }, + { + "epoch": 1.1950639310139757, + "grad_norm": 0.6433960199356079, + "learning_rate": 8.81807372175981e-06, + "loss": 0.4906, + "step": 503 + }, + { + "epoch": 1.1974427594409753, + "grad_norm": 0.5431859493255615, + "learning_rate": 8.815695600475625e-06, + "loss": 0.4537, + "step": 504 + }, + { + "epoch": 1.199821587867975, + "grad_norm": 0.6516239047050476, + "learning_rate": 8.81331747919144e-06, + "loss": 0.4837, + "step": 505 + }, + { + "epoch": 1.2022004162949746, + "grad_norm": 0.6837879419326782, + "learning_rate": 8.810939357907254e-06, + "loss": 0.5125, + "step": 506 + }, + { + "epoch": 1.2045792447219745, + "grad_norm": 0.8031286001205444, + "learning_rate": 8.80856123662307e-06, + "loss": 0.5352, + "step": 507 + }, + { + "epoch": 1.2069580731489742, + "grad_norm": 0.7429251074790955, + "learning_rate": 8.806183115338883e-06, + "loss": 0.4994, + "step": 508 + }, + { + "epoch": 1.2093369015759738, + "grad_norm": 0.6564574241638184, + "learning_rate": 8.803804994054698e-06, + "loss": 0.531, + "step": 509 + }, + { + "epoch": 1.2117157300029735, + "grad_norm": 0.5722794532775879, + "learning_rate": 8.801426872770512e-06, + "loss": 0.4641, + "step": 510 + }, + { + "epoch": 1.2140945584299732, + "grad_norm": 0.7049525380134583, + "learning_rate": 8.799048751486326e-06, + "loss": 0.5087, + "step": 511 + }, + { + "epoch": 1.216473386856973, + "grad_norm": 0.5475321412086487, + "learning_rate": 8.796670630202141e-06, + "loss": 0.508, + "step": 512 + }, + { + "epoch": 1.2188522152839727, + "grad_norm": 0.6623636484146118, + "learning_rate": 8.794292508917955e-06, + "loss": 0.4994, + "step": 513 + }, + { + "epoch": 1.2212310437109724, + "grad_norm": 0.560063362121582, + "learning_rate": 8.79191438763377e-06, + "loss": 0.4711, + "step": 514 + }, + { + "epoch": 1.223609872137972, + "grad_norm": 0.7380957007408142, + "learning_rate": 8.789536266349584e-06, + "loss": 0.5037, + "step": 515 + }, + { + "epoch": 1.2259887005649717, + "grad_norm": 0.6598972678184509, + "learning_rate": 8.787158145065398e-06, + "loss": 0.5579, + "step": 516 + }, + { + "epoch": 1.2283675289919715, + "grad_norm": 0.7130924463272095, + "learning_rate": 8.784780023781214e-06, + "loss": 0.5343, + "step": 517 + }, + { + "epoch": 1.2307463574189712, + "grad_norm": 0.6275978088378906, + "learning_rate": 8.782401902497029e-06, + "loss": 0.4834, + "step": 518 + }, + { + "epoch": 1.2331251858459709, + "grad_norm": 0.6108331084251404, + "learning_rate": 8.780023781212843e-06, + "loss": 0.3973, + "step": 519 + }, + { + "epoch": 1.2355040142729705, + "grad_norm": 0.55642169713974, + "learning_rate": 8.777645659928658e-06, + "loss": 0.4856, + "step": 520 + }, + { + "epoch": 1.2378828426999702, + "grad_norm": 0.6282866597175598, + "learning_rate": 8.775267538644472e-06, + "loss": 0.4233, + "step": 521 + }, + { + "epoch": 1.24026167112697, + "grad_norm": 0.6580280661582947, + "learning_rate": 8.772889417360286e-06, + "loss": 0.5391, + "step": 522 + }, + { + "epoch": 1.2426404995539697, + "grad_norm": 0.6904391050338745, + "learning_rate": 8.770511296076101e-06, + "loss": 0.5525, + "step": 523 + }, + { + "epoch": 1.2450193279809694, + "grad_norm": 0.6049892902374268, + "learning_rate": 8.768133174791915e-06, + "loss": 0.5347, + "step": 524 + }, + { + "epoch": 1.247398156407969, + "grad_norm": 0.6755518317222595, + "learning_rate": 8.765755053507729e-06, + "loss": 0.4737, + "step": 525 + }, + { + "epoch": 1.2497769848349687, + "grad_norm": 0.5695744752883911, + "learning_rate": 8.763376932223544e-06, + "loss": 0.3831, + "step": 526 + }, + { + "epoch": 1.2521558132619686, + "grad_norm": 0.7018800377845764, + "learning_rate": 8.760998810939358e-06, + "loss": 0.5031, + "step": 527 + }, + { + "epoch": 1.2545346416889682, + "grad_norm": 0.6101343035697937, + "learning_rate": 8.758620689655173e-06, + "loss": 0.466, + "step": 528 + }, + { + "epoch": 1.2569134701159679, + "grad_norm": 0.6405492424964905, + "learning_rate": 8.756242568370987e-06, + "loss": 0.4236, + "step": 529 + }, + { + "epoch": 1.2592922985429675, + "grad_norm": 0.6529067158699036, + "learning_rate": 8.753864447086801e-06, + "loss": 0.5722, + "step": 530 + }, + { + "epoch": 1.2616711269699672, + "grad_norm": 0.6821762919425964, + "learning_rate": 8.751486325802616e-06, + "loss": 0.5247, + "step": 531 + }, + { + "epoch": 1.264049955396967, + "grad_norm": 0.598793089389801, + "learning_rate": 8.749108204518432e-06, + "loss": 0.4869, + "step": 532 + }, + { + "epoch": 1.2664287838239667, + "grad_norm": 0.6112446784973145, + "learning_rate": 8.746730083234246e-06, + "loss": 0.3759, + "step": 533 + }, + { + "epoch": 1.2688076122509664, + "grad_norm": 0.5927761197090149, + "learning_rate": 8.744351961950061e-06, + "loss": 0.4702, + "step": 534 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 0.592856764793396, + "learning_rate": 8.741973840665875e-06, + "loss": 0.4764, + "step": 535 + }, + { + "epoch": 1.273565269104966, + "grad_norm": 0.6758366823196411, + "learning_rate": 8.739595719381689e-06, + "loss": 0.5322, + "step": 536 + }, + { + "epoch": 1.2759440975319656, + "grad_norm": 0.5624903440475464, + "learning_rate": 8.737217598097504e-06, + "loss": 0.4475, + "step": 537 + }, + { + "epoch": 1.2783229259589652, + "grad_norm": 0.5828344225883484, + "learning_rate": 8.734839476813318e-06, + "loss": 0.4225, + "step": 538 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 0.6537852883338928, + "learning_rate": 8.732461355529133e-06, + "loss": 0.4305, + "step": 539 + }, + { + "epoch": 1.2830805828129646, + "grad_norm": 0.6657189130783081, + "learning_rate": 8.730083234244947e-06, + "loss": 0.4664, + "step": 540 + }, + { + "epoch": 1.2854594112399642, + "grad_norm": 0.6685323119163513, + "learning_rate": 8.727705112960761e-06, + "loss": 0.4672, + "step": 541 + }, + { + "epoch": 1.287838239666964, + "grad_norm": 0.6321278214454651, + "learning_rate": 8.725326991676576e-06, + "loss": 0.5096, + "step": 542 + }, + { + "epoch": 1.2902170680939637, + "grad_norm": 0.6335569620132446, + "learning_rate": 8.72294887039239e-06, + "loss": 0.5381, + "step": 543 + }, + { + "epoch": 1.2925958965209634, + "grad_norm": 0.6004913449287415, + "learning_rate": 8.720570749108206e-06, + "loss": 0.4897, + "step": 544 + }, + { + "epoch": 1.294974724947963, + "grad_norm": 0.677250325679779, + "learning_rate": 8.71819262782402e-06, + "loss": 0.5128, + "step": 545 + }, + { + "epoch": 1.297353553374963, + "grad_norm": 0.7083107233047485, + "learning_rate": 8.715814506539835e-06, + "loss": 0.528, + "step": 546 + }, + { + "epoch": 1.2997323818019626, + "grad_norm": 0.5648649334907532, + "learning_rate": 8.713436385255649e-06, + "loss": 0.4591, + "step": 547 + }, + { + "epoch": 1.3021112102289623, + "grad_norm": 0.6569650769233704, + "learning_rate": 8.711058263971464e-06, + "loss": 0.4695, + "step": 548 + }, + { + "epoch": 1.304490038655962, + "grad_norm": 0.6426310539245605, + "learning_rate": 8.708680142687278e-06, + "loss": 0.4801, + "step": 549 + }, + { + "epoch": 1.3068688670829616, + "grad_norm": 0.6406787037849426, + "learning_rate": 8.706302021403093e-06, + "loss": 0.4849, + "step": 550 + }, + { + "epoch": 1.3068688670829616, + "eval_loss": 0.5014833807945251, + "eval_runtime": 24.2183, + "eval_samples_per_second": 30.886, + "eval_steps_per_second": 15.443, + "step": 550 + }, + { + "epoch": 1.3092476955099612, + "grad_norm": 0.5631191730499268, + "learning_rate": 8.703923900118907e-06, + "loss": 0.4418, + "step": 551 + }, + { + "epoch": 1.311626523936961, + "grad_norm": 0.6141483187675476, + "learning_rate": 8.70154577883472e-06, + "loss": 0.5383, + "step": 552 + }, + { + "epoch": 1.3140053523639608, + "grad_norm": 0.7069640159606934, + "learning_rate": 8.699167657550536e-06, + "loss": 0.485, + "step": 553 + }, + { + "epoch": 1.3163841807909604, + "grad_norm": 0.7682774066925049, + "learning_rate": 8.69678953626635e-06, + "loss": 0.481, + "step": 554 + }, + { + "epoch": 1.31876300921796, + "grad_norm": 0.6545090079307556, + "learning_rate": 8.694411414982164e-06, + "loss": 0.5773, + "step": 555 + }, + { + "epoch": 1.32114183764496, + "grad_norm": 0.6870193481445312, + "learning_rate": 8.69203329369798e-06, + "loss": 0.4554, + "step": 556 + }, + { + "epoch": 1.3235206660719596, + "grad_norm": 0.6468110680580139, + "learning_rate": 8.689655172413793e-06, + "loss": 0.4976, + "step": 557 + }, + { + "epoch": 1.3258994944989593, + "grad_norm": 0.5881150960922241, + "learning_rate": 8.687277051129608e-06, + "loss": 0.4562, + "step": 558 + }, + { + "epoch": 1.328278322925959, + "grad_norm": 0.5610436201095581, + "learning_rate": 8.684898929845424e-06, + "loss": 0.4079, + "step": 559 + }, + { + "epoch": 1.3306571513529586, + "grad_norm": 0.6507015824317932, + "learning_rate": 8.682520808561238e-06, + "loss": 0.4336, + "step": 560 + }, + { + "epoch": 1.3330359797799582, + "grad_norm": 0.6858365535736084, + "learning_rate": 8.680142687277051e-06, + "loss": 0.417, + "step": 561 + }, + { + "epoch": 1.3354148082069581, + "grad_norm": 0.741806149482727, + "learning_rate": 8.677764565992867e-06, + "loss": 0.4697, + "step": 562 + }, + { + "epoch": 1.3377936366339578, + "grad_norm": 0.6826411485671997, + "learning_rate": 8.67538644470868e-06, + "loss": 0.4813, + "step": 563 + }, + { + "epoch": 1.3401724650609574, + "grad_norm": 0.6269429326057434, + "learning_rate": 8.673008323424496e-06, + "loss": 0.448, + "step": 564 + }, + { + "epoch": 1.342551293487957, + "grad_norm": 0.672722578048706, + "learning_rate": 8.67063020214031e-06, + "loss": 0.4944, + "step": 565 + }, + { + "epoch": 1.344930121914957, + "grad_norm": 0.6128204464912415, + "learning_rate": 8.668252080856124e-06, + "loss": 0.539, + "step": 566 + }, + { + "epoch": 1.3473089503419566, + "grad_norm": 0.7670498490333557, + "learning_rate": 8.665873959571939e-06, + "loss": 0.5361, + "step": 567 + }, + { + "epoch": 1.3496877787689563, + "grad_norm": 0.7337983846664429, + "learning_rate": 8.663495838287753e-06, + "loss": 0.4843, + "step": 568 + }, + { + "epoch": 1.352066607195956, + "grad_norm": 0.642518162727356, + "learning_rate": 8.661117717003568e-06, + "loss": 0.5051, + "step": 569 + }, + { + "epoch": 1.3544454356229556, + "grad_norm": 0.6973591446876526, + "learning_rate": 8.658739595719382e-06, + "loss": 0.5389, + "step": 570 + }, + { + "epoch": 1.3568242640499553, + "grad_norm": 0.7222164273262024, + "learning_rate": 8.656361474435196e-06, + "loss": 0.4719, + "step": 571 + }, + { + "epoch": 1.3592030924769551, + "grad_norm": 0.6675779819488525, + "learning_rate": 8.653983353151011e-06, + "loss": 0.4818, + "step": 572 + }, + { + "epoch": 1.3615819209039548, + "grad_norm": 0.7329903244972229, + "learning_rate": 8.651605231866827e-06, + "loss": 0.5575, + "step": 573 + }, + { + "epoch": 1.3639607493309545, + "grad_norm": 0.6188318133354187, + "learning_rate": 8.64922711058264e-06, + "loss": 0.4479, + "step": 574 + }, + { + "epoch": 1.3663395777579543, + "grad_norm": 0.7027018666267395, + "learning_rate": 8.646848989298456e-06, + "loss": 0.4955, + "step": 575 + }, + { + "epoch": 1.368718406184954, + "grad_norm": 0.6696898937225342, + "learning_rate": 8.64447086801427e-06, + "loss": 0.5068, + "step": 576 + }, + { + "epoch": 1.3710972346119537, + "grad_norm": 0.6100932955741882, + "learning_rate": 8.642092746730083e-06, + "loss": 0.4403, + "step": 577 + }, + { + "epoch": 1.3734760630389533, + "grad_norm": 0.6493490934371948, + "learning_rate": 8.639714625445899e-06, + "loss": 0.5086, + "step": 578 + }, + { + "epoch": 1.375854891465953, + "grad_norm": 0.6062465906143188, + "learning_rate": 8.637336504161713e-06, + "loss": 0.4498, + "step": 579 + }, + { + "epoch": 1.3782337198929526, + "grad_norm": 0.7127209305763245, + "learning_rate": 8.634958382877528e-06, + "loss": 0.5135, + "step": 580 + }, + { + "epoch": 1.3806125483199525, + "grad_norm": 0.7223253846168518, + "learning_rate": 8.632580261593342e-06, + "loss": 0.5457, + "step": 581 + }, + { + "epoch": 1.3829913767469522, + "grad_norm": 0.6758919358253479, + "learning_rate": 8.630202140309156e-06, + "loss": 0.4261, + "step": 582 + }, + { + "epoch": 1.3853702051739518, + "grad_norm": 0.6375535726547241, + "learning_rate": 8.627824019024971e-06, + "loss": 0.5267, + "step": 583 + }, + { + "epoch": 1.3877490336009515, + "grad_norm": 0.5236299633979797, + "learning_rate": 8.625445897740785e-06, + "loss": 0.4051, + "step": 584 + }, + { + "epoch": 1.3901278620279514, + "grad_norm": 0.6526315212249756, + "learning_rate": 8.6230677764566e-06, + "loss": 0.435, + "step": 585 + }, + { + "epoch": 1.392506690454951, + "grad_norm": 0.7031404972076416, + "learning_rate": 8.620689655172414e-06, + "loss": 0.3864, + "step": 586 + }, + { + "epoch": 1.3948855188819507, + "grad_norm": 0.7010306715965271, + "learning_rate": 8.61831153388823e-06, + "loss": 0.4666, + "step": 587 + }, + { + "epoch": 1.3972643473089503, + "grad_norm": 0.6778327822685242, + "learning_rate": 8.615933412604043e-06, + "loss": 0.4051, + "step": 588 + }, + { + "epoch": 1.39964317573595, + "grad_norm": 0.6586312651634216, + "learning_rate": 8.613555291319859e-06, + "loss": 0.5521, + "step": 589 + }, + { + "epoch": 1.4020220041629496, + "grad_norm": 0.6483843922615051, + "learning_rate": 8.611177170035673e-06, + "loss": 0.4305, + "step": 590 + }, + { + "epoch": 1.4044008325899495, + "grad_norm": 0.7792129516601562, + "learning_rate": 8.608799048751486e-06, + "loss": 0.501, + "step": 591 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 0.7350901365280151, + "learning_rate": 8.606420927467302e-06, + "loss": 0.5084, + "step": 592 + }, + { + "epoch": 1.4091584894439488, + "grad_norm": 0.6705347299575806, + "learning_rate": 8.604042806183116e-06, + "loss": 0.4315, + "step": 593 + }, + { + "epoch": 1.4115373178709485, + "grad_norm": 0.721199095249176, + "learning_rate": 8.601664684898931e-06, + "loss": 0.5829, + "step": 594 + }, + { + "epoch": 1.4139161462979484, + "grad_norm": 0.6253507733345032, + "learning_rate": 8.599286563614745e-06, + "loss": 0.4095, + "step": 595 + }, + { + "epoch": 1.416294974724948, + "grad_norm": 0.7098877429962158, + "learning_rate": 8.596908442330559e-06, + "loss": 0.4447, + "step": 596 + }, + { + "epoch": 1.4186738031519477, + "grad_norm": 0.7294139266014099, + "learning_rate": 8.594530321046374e-06, + "loss": 0.5665, + "step": 597 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.6800783276557922, + "learning_rate": 8.592152199762188e-06, + "loss": 0.466, + "step": 598 + }, + { + "epoch": 1.423431460005947, + "grad_norm": 0.6101962924003601, + "learning_rate": 8.589774078478003e-06, + "loss": 0.4822, + "step": 599 + }, + { + "epoch": 1.4258102884329467, + "grad_norm": 0.7860095500946045, + "learning_rate": 8.587395957193819e-06, + "loss": 0.5967, + "step": 600 + }, + { + "epoch": 1.4258102884329467, + "eval_loss": 0.49441060423851013, + "eval_runtime": 23.296, + "eval_samples_per_second": 32.108, + "eval_steps_per_second": 16.054, + "step": 600 + }, + { + "epoch": 1.4281891168599465, + "grad_norm": 0.6594057679176331, + "learning_rate": 8.585017835909632e-06, + "loss": 0.4716, + "step": 601 + }, + { + "epoch": 1.4305679452869462, + "grad_norm": 0.7036879062652588, + "learning_rate": 8.582639714625446e-06, + "loss": 0.4835, + "step": 602 + }, + { + "epoch": 1.4329467737139459, + "grad_norm": 0.7226179242134094, + "learning_rate": 8.580261593341262e-06, + "loss": 0.5633, + "step": 603 + }, + { + "epoch": 1.4353256021409455, + "grad_norm": 0.6342655420303345, + "learning_rate": 8.577883472057075e-06, + "loss": 0.4247, + "step": 604 + }, + { + "epoch": 1.4377044305679454, + "grad_norm": 0.6681719422340393, + "learning_rate": 8.575505350772891e-06, + "loss": 0.4509, + "step": 605 + }, + { + "epoch": 1.440083258994945, + "grad_norm": 0.6148503422737122, + "learning_rate": 8.573127229488705e-06, + "loss": 0.4499, + "step": 606 + }, + { + "epoch": 1.4424620874219447, + "grad_norm": 0.6982125043869019, + "learning_rate": 8.570749108204518e-06, + "loss": 0.456, + "step": 607 + }, + { + "epoch": 1.4448409158489444, + "grad_norm": 0.7340528964996338, + "learning_rate": 8.568370986920334e-06, + "loss": 0.5454, + "step": 608 + }, + { + "epoch": 1.447219744275944, + "grad_norm": 0.6691039800643921, + "learning_rate": 8.565992865636148e-06, + "loss": 0.5125, + "step": 609 + }, + { + "epoch": 1.4495985727029437, + "grad_norm": 0.8440302014350891, + "learning_rate": 8.563614744351963e-06, + "loss": 0.521, + "step": 610 + }, + { + "epoch": 1.4519774011299436, + "grad_norm": 0.6891091465950012, + "learning_rate": 8.561236623067777e-06, + "loss": 0.4924, + "step": 611 + }, + { + "epoch": 1.4543562295569432, + "grad_norm": 0.7212610840797424, + "learning_rate": 8.55885850178359e-06, + "loss": 0.5089, + "step": 612 + }, + { + "epoch": 1.4567350579839429, + "grad_norm": 0.6578139066696167, + "learning_rate": 8.556480380499406e-06, + "loss": 0.4951, + "step": 613 + }, + { + "epoch": 1.4591138864109425, + "grad_norm": 0.6543301343917847, + "learning_rate": 8.554102259215222e-06, + "loss": 0.4623, + "step": 614 + }, + { + "epoch": 1.4614927148379424, + "grad_norm": 0.6200469136238098, + "learning_rate": 8.551724137931035e-06, + "loss": 0.4958, + "step": 615 + }, + { + "epoch": 1.463871543264942, + "grad_norm": 0.6813637018203735, + "learning_rate": 8.54934601664685e-06, + "loss": 0.4956, + "step": 616 + }, + { + "epoch": 1.4662503716919417, + "grad_norm": 0.7992137670516968, + "learning_rate": 8.546967895362665e-06, + "loss": 0.5711, + "step": 617 + }, + { + "epoch": 1.4686292001189414, + "grad_norm": 0.6702547073364258, + "learning_rate": 8.544589774078478e-06, + "loss": 0.4848, + "step": 618 + }, + { + "epoch": 1.471008028545941, + "grad_norm": 0.7093238830566406, + "learning_rate": 8.542211652794294e-06, + "loss": 0.4628, + "step": 619 + }, + { + "epoch": 1.4733868569729407, + "grad_norm": 0.695806622505188, + "learning_rate": 8.539833531510108e-06, + "loss": 0.5224, + "step": 620 + }, + { + "epoch": 1.4757656853999406, + "grad_norm": 0.7120128273963928, + "learning_rate": 8.537455410225923e-06, + "loss": 0.4582, + "step": 621 + }, + { + "epoch": 1.4781445138269402, + "grad_norm": 0.6786816716194153, + "learning_rate": 8.535077288941737e-06, + "loss": 0.4832, + "step": 622 + }, + { + "epoch": 1.48052334225394, + "grad_norm": 0.6671195030212402, + "learning_rate": 8.53269916765755e-06, + "loss": 0.4611, + "step": 623 + }, + { + "epoch": 1.4829021706809398, + "grad_norm": 0.631097137928009, + "learning_rate": 8.530321046373366e-06, + "loss": 0.4509, + "step": 624 + }, + { + "epoch": 1.4852809991079394, + "grad_norm": 0.7614601254463196, + "learning_rate": 8.52794292508918e-06, + "loss": 0.4596, + "step": 625 + }, + { + "epoch": 1.487659827534939, + "grad_norm": 0.7357919812202454, + "learning_rate": 8.525564803804994e-06, + "loss": 0.465, + "step": 626 + }, + { + "epoch": 1.4900386559619387, + "grad_norm": 0.7816480994224548, + "learning_rate": 8.523186682520809e-06, + "loss": 0.5305, + "step": 627 + }, + { + "epoch": 1.4924174843889384, + "grad_norm": 0.6916353106498718, + "learning_rate": 8.520808561236624e-06, + "loss": 0.4567, + "step": 628 + }, + { + "epoch": 1.494796312815938, + "grad_norm": 0.6804481148719788, + "learning_rate": 8.518430439952438e-06, + "loss": 0.4437, + "step": 629 + }, + { + "epoch": 1.497175141242938, + "grad_norm": 0.6643516421318054, + "learning_rate": 8.516052318668254e-06, + "loss": 0.4379, + "step": 630 + }, + { + "epoch": 1.4995539696699376, + "grad_norm": 0.7280964255332947, + "learning_rate": 8.513674197384067e-06, + "loss": 0.5391, + "step": 631 + }, + { + "epoch": 1.5019327980969372, + "grad_norm": 0.7534157633781433, + "learning_rate": 8.511296076099881e-06, + "loss": 0.4687, + "step": 632 + }, + { + "epoch": 1.504311626523937, + "grad_norm": 0.7346697449684143, + "learning_rate": 8.508917954815697e-06, + "loss": 0.4738, + "step": 633 + }, + { + "epoch": 1.5066904549509368, + "grad_norm": 0.709813117980957, + "learning_rate": 8.50653983353151e-06, + "loss": 0.4253, + "step": 634 + }, + { + "epoch": 1.5090692833779364, + "grad_norm": 0.5872085094451904, + "learning_rate": 8.504161712247326e-06, + "loss": 0.4627, + "step": 635 + }, + { + "epoch": 1.511448111804936, + "grad_norm": 0.595168948173523, + "learning_rate": 8.50178359096314e-06, + "loss": 0.4202, + "step": 636 + }, + { + "epoch": 1.5138269402319358, + "grad_norm": 0.7629631757736206, + "learning_rate": 8.499405469678953e-06, + "loss": 0.439, + "step": 637 + }, + { + "epoch": 1.5162057686589354, + "grad_norm": 0.7511978149414062, + "learning_rate": 8.497027348394769e-06, + "loss": 0.5296, + "step": 638 + }, + { + "epoch": 1.518584597085935, + "grad_norm": 0.6487764120101929, + "learning_rate": 8.494649227110583e-06, + "loss": 0.4444, + "step": 639 + }, + { + "epoch": 1.5209634255129347, + "grad_norm": 0.6632949113845825, + "learning_rate": 8.492271105826398e-06, + "loss": 0.4097, + "step": 640 + }, + { + "epoch": 1.5233422539399346, + "grad_norm": 0.7085477709770203, + "learning_rate": 8.489892984542214e-06, + "loss": 0.5164, + "step": 641 + }, + { + "epoch": 1.5257210823669343, + "grad_norm": 0.6824425458908081, + "learning_rate": 8.487514863258027e-06, + "loss": 0.449, + "step": 642 + }, + { + "epoch": 1.5280999107939341, + "grad_norm": 0.6685028076171875, + "learning_rate": 8.485136741973841e-06, + "loss": 0.4815, + "step": 643 + }, + { + "epoch": 1.5304787392209338, + "grad_norm": 0.6634393334388733, + "learning_rate": 8.482758620689656e-06, + "loss": 0.5045, + "step": 644 + }, + { + "epoch": 1.5328575676479335, + "grad_norm": 0.7138300538063049, + "learning_rate": 8.48038049940547e-06, + "loss": 0.5351, + "step": 645 + }, + { + "epoch": 1.5352363960749331, + "grad_norm": 0.6326372027397156, + "learning_rate": 8.478002378121286e-06, + "loss": 0.4329, + "step": 646 + }, + { + "epoch": 1.5376152245019328, + "grad_norm": 0.675590455532074, + "learning_rate": 8.4756242568371e-06, + "loss": 0.4885, + "step": 647 + }, + { + "epoch": 1.5399940529289324, + "grad_norm": 0.6294907331466675, + "learning_rate": 8.473246135552913e-06, + "loss": 0.4387, + "step": 648 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 0.7870957851409912, + "learning_rate": 8.470868014268729e-06, + "loss": 0.4854, + "step": 649 + }, + { + "epoch": 1.5447517097829317, + "grad_norm": 0.9078510403633118, + "learning_rate": 8.468489892984542e-06, + "loss": 0.4557, + "step": 650 + }, + { + "epoch": 1.5447517097829317, + "eval_loss": 0.4878033399581909, + "eval_runtime": 23.9964, + "eval_samples_per_second": 31.171, + "eval_steps_per_second": 15.586, + "step": 650 + }, + { + "epoch": 1.5471305382099316, + "grad_norm": 0.6073133945465088, + "learning_rate": 8.466111771700358e-06, + "loss": 0.4646, + "step": 651 + }, + { + "epoch": 1.5495093666369313, + "grad_norm": 0.616635799407959, + "learning_rate": 8.463733650416172e-06, + "loss": 0.5227, + "step": 652 + }, + { + "epoch": 1.5518881950639312, + "grad_norm": 0.6601746082305908, + "learning_rate": 8.461355529131985e-06, + "loss": 0.4734, + "step": 653 + }, + { + "epoch": 1.5542670234909308, + "grad_norm": 0.6344591379165649, + "learning_rate": 8.458977407847801e-06, + "loss": 0.4514, + "step": 654 + }, + { + "epoch": 1.5566458519179305, + "grad_norm": 0.7135318517684937, + "learning_rate": 8.456599286563616e-06, + "loss": 0.5089, + "step": 655 + }, + { + "epoch": 1.5590246803449301, + "grad_norm": 0.756536602973938, + "learning_rate": 8.45422116527943e-06, + "loss": 0.5135, + "step": 656 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.6871662735939026, + "learning_rate": 8.451843043995246e-06, + "loss": 0.5332, + "step": 657 + }, + { + "epoch": 1.5637823371989295, + "grad_norm": 0.6587780714035034, + "learning_rate": 8.44946492271106e-06, + "loss": 0.4312, + "step": 658 + }, + { + "epoch": 1.566161165625929, + "grad_norm": 0.6951320171356201, + "learning_rate": 8.447086801426873e-06, + "loss": 0.4536, + "step": 659 + }, + { + "epoch": 1.568539994052929, + "grad_norm": 0.6588094234466553, + "learning_rate": 8.444708680142689e-06, + "loss": 0.4642, + "step": 660 + }, + { + "epoch": 1.5709188224799286, + "grad_norm": 0.7331330180168152, + "learning_rate": 8.442330558858502e-06, + "loss": 0.4916, + "step": 661 + }, + { + "epoch": 1.5732976509069283, + "grad_norm": 0.6652212142944336, + "learning_rate": 8.439952437574316e-06, + "loss": 0.4963, + "step": 662 + }, + { + "epoch": 1.5756764793339282, + "grad_norm": 0.6374779343605042, + "learning_rate": 8.437574316290132e-06, + "loss": 0.4348, + "step": 663 + }, + { + "epoch": 1.5780553077609278, + "grad_norm": 0.6832917928695679, + "learning_rate": 8.435196195005945e-06, + "loss": 0.36, + "step": 664 + }, + { + "epoch": 1.5804341361879275, + "grad_norm": 0.733951210975647, + "learning_rate": 8.43281807372176e-06, + "loss": 0.4916, + "step": 665 + }, + { + "epoch": 1.5828129646149272, + "grad_norm": 0.7459485530853271, + "learning_rate": 8.430439952437575e-06, + "loss": 0.4507, + "step": 666 + }, + { + "epoch": 1.5851917930419268, + "grad_norm": 0.769730269908905, + "learning_rate": 8.428061831153388e-06, + "loss": 0.5268, + "step": 667 + }, + { + "epoch": 1.5875706214689265, + "grad_norm": 0.6664831042289734, + "learning_rate": 8.425683709869204e-06, + "loss": 0.4712, + "step": 668 + }, + { + "epoch": 1.5899494498959261, + "grad_norm": 0.673033595085144, + "learning_rate": 8.42330558858502e-06, + "loss": 0.3788, + "step": 669 + }, + { + "epoch": 1.592328278322926, + "grad_norm": 0.703697919845581, + "learning_rate": 8.420927467300833e-06, + "loss": 0.46, + "step": 670 + }, + { + "epoch": 1.5947071067499257, + "grad_norm": 0.6156996488571167, + "learning_rate": 8.418549346016648e-06, + "loss": 0.4419, + "step": 671 + }, + { + "epoch": 1.5970859351769253, + "grad_norm": 0.7024971842765808, + "learning_rate": 8.416171224732462e-06, + "loss": 0.5157, + "step": 672 + }, + { + "epoch": 1.5994647636039252, + "grad_norm": 0.7966590523719788, + "learning_rate": 8.413793103448276e-06, + "loss": 0.5456, + "step": 673 + }, + { + "epoch": 1.6018435920309249, + "grad_norm": 0.6824311017990112, + "learning_rate": 8.411414982164091e-06, + "loss": 0.5162, + "step": 674 + }, + { + "epoch": 1.6042224204579245, + "grad_norm": 0.7871618270874023, + "learning_rate": 8.409036860879905e-06, + "loss": 0.5661, + "step": 675 + }, + { + "epoch": 1.6066012488849242, + "grad_norm": 0.6961207389831543, + "learning_rate": 8.40665873959572e-06, + "loss": 0.4857, + "step": 676 + }, + { + "epoch": 1.6089800773119238, + "grad_norm": 0.6626442074775696, + "learning_rate": 8.404280618311534e-06, + "loss": 0.4368, + "step": 677 + }, + { + "epoch": 1.6113589057389235, + "grad_norm": 0.7076789736747742, + "learning_rate": 8.401902497027348e-06, + "loss": 0.4877, + "step": 678 + }, + { + "epoch": 1.6137377341659231, + "grad_norm": 0.6668829917907715, + "learning_rate": 8.399524375743164e-06, + "loss": 0.4316, + "step": 679 + }, + { + "epoch": 1.616116562592923, + "grad_norm": 0.6243900060653687, + "learning_rate": 8.397146254458977e-06, + "loss": 0.4531, + "step": 680 + }, + { + "epoch": 1.6184953910199227, + "grad_norm": 0.7516874074935913, + "learning_rate": 8.394768133174793e-06, + "loss": 0.5004, + "step": 681 + }, + { + "epoch": 1.6208742194469223, + "grad_norm": 0.6494477987289429, + "learning_rate": 8.392390011890608e-06, + "loss": 0.4494, + "step": 682 + }, + { + "epoch": 1.6232530478739222, + "grad_norm": 0.7831568717956543, + "learning_rate": 8.390011890606422e-06, + "loss": 0.4905, + "step": 683 + }, + { + "epoch": 1.6256318763009219, + "grad_norm": 0.8018678426742554, + "learning_rate": 8.387633769322236e-06, + "loss": 0.5087, + "step": 684 + }, + { + "epoch": 1.6280107047279215, + "grad_norm": 0.8098955750465393, + "learning_rate": 8.385255648038051e-06, + "loss": 0.5563, + "step": 685 + }, + { + "epoch": 1.6303895331549212, + "grad_norm": 0.7099370360374451, + "learning_rate": 8.382877526753865e-06, + "loss": 0.4443, + "step": 686 + }, + { + "epoch": 1.6327683615819208, + "grad_norm": 0.7208690643310547, + "learning_rate": 8.38049940546968e-06, + "loss": 0.5106, + "step": 687 + }, + { + "epoch": 1.6351471900089205, + "grad_norm": 0.8040987253189087, + "learning_rate": 8.378121284185494e-06, + "loss": 0.6185, + "step": 688 + }, + { + "epoch": 1.6375260184359202, + "grad_norm": 0.7410775423049927, + "learning_rate": 8.375743162901308e-06, + "loss": 0.3994, + "step": 689 + }, + { + "epoch": 1.63990484686292, + "grad_norm": 0.7176697850227356, + "learning_rate": 8.373365041617124e-06, + "loss": 0.4808, + "step": 690 + }, + { + "epoch": 1.6422836752899197, + "grad_norm": 0.6124643683433533, + "learning_rate": 8.370986920332937e-06, + "loss": 0.4433, + "step": 691 + }, + { + "epoch": 1.6446625037169196, + "grad_norm": 0.7167659401893616, + "learning_rate": 8.368608799048753e-06, + "loss": 0.5048, + "step": 692 + }, + { + "epoch": 1.6470413321439192, + "grad_norm": 0.8964076042175293, + "learning_rate": 8.366230677764567e-06, + "loss": 0.5479, + "step": 693 + }, + { + "epoch": 1.649420160570919, + "grad_norm": 0.964664876461029, + "learning_rate": 8.36385255648038e-06, + "loss": 0.6823, + "step": 694 + }, + { + "epoch": 1.6517989889979185, + "grad_norm": 0.7884921431541443, + "learning_rate": 8.361474435196196e-06, + "loss": 0.505, + "step": 695 + }, + { + "epoch": 1.6541778174249182, + "grad_norm": 0.646920382976532, + "learning_rate": 8.359096313912011e-06, + "loss": 0.4368, + "step": 696 + }, + { + "epoch": 1.6565566458519179, + "grad_norm": 0.7565163373947144, + "learning_rate": 8.356718192627825e-06, + "loss": 0.5992, + "step": 697 + }, + { + "epoch": 1.6589354742789175, + "grad_norm": 0.636759340763092, + "learning_rate": 8.35434007134364e-06, + "loss": 0.4477, + "step": 698 + }, + { + "epoch": 1.6613143027059172, + "grad_norm": 0.715076208114624, + "learning_rate": 8.351961950059454e-06, + "loss": 0.4642, + "step": 699 + }, + { + "epoch": 1.663693131132917, + "grad_norm": 0.6617520451545715, + "learning_rate": 8.349583828775268e-06, + "loss": 0.519, + "step": 700 + }, + { + "epoch": 1.663693131132917, + "eval_loss": 0.4823816120624542, + "eval_runtime": 24.1209, + "eval_samples_per_second": 31.01, + "eval_steps_per_second": 15.505, + "step": 700 + }, + { + "epoch": 1.6660719595599167, + "grad_norm": 0.7534847259521484, + "learning_rate": 8.347205707491083e-06, + "loss": 0.4453, + "step": 701 + }, + { + "epoch": 1.6684507879869166, + "grad_norm": 0.7491120100021362, + "learning_rate": 8.344827586206897e-06, + "loss": 0.4943, + "step": 702 + }, + { + "epoch": 1.6708296164139163, + "grad_norm": 0.7370908856391907, + "learning_rate": 8.342449464922711e-06, + "loss": 0.4301, + "step": 703 + }, + { + "epoch": 1.673208444840916, + "grad_norm": 0.6885918974876404, + "learning_rate": 8.340071343638526e-06, + "loss": 0.4412, + "step": 704 + }, + { + "epoch": 1.6755872732679156, + "grad_norm": 0.7213874459266663, + "learning_rate": 8.33769322235434e-06, + "loss": 0.4605, + "step": 705 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 0.6650336384773254, + "learning_rate": 8.335315101070156e-06, + "loss": 0.4301, + "step": 706 + }, + { + "epoch": 1.6803449301219149, + "grad_norm": 0.7063350081443787, + "learning_rate": 8.33293697978597e-06, + "loss": 0.4615, + "step": 707 + }, + { + "epoch": 1.6827237585489145, + "grad_norm": 0.7506964802742004, + "learning_rate": 8.330558858501783e-06, + "loss": 0.4122, + "step": 708 + }, + { + "epoch": 1.6851025869759142, + "grad_norm": 0.7118714451789856, + "learning_rate": 8.328180737217599e-06, + "loss": 0.513, + "step": 709 + }, + { + "epoch": 1.687481415402914, + "grad_norm": 0.7860468029975891, + "learning_rate": 8.325802615933414e-06, + "loss": 0.4835, + "step": 710 + }, + { + "epoch": 1.6898602438299137, + "grad_norm": 0.6780351996421814, + "learning_rate": 8.323424494649228e-06, + "loss": 0.4352, + "step": 711 + }, + { + "epoch": 1.6922390722569136, + "grad_norm": 0.702238142490387, + "learning_rate": 8.321046373365043e-06, + "loss": 0.4874, + "step": 712 + }, + { + "epoch": 1.6946179006839133, + "grad_norm": 0.8472846150398254, + "learning_rate": 8.318668252080857e-06, + "loss": 0.4987, + "step": 713 + }, + { + "epoch": 1.696996729110913, + "grad_norm": 0.821767270565033, + "learning_rate": 8.31629013079667e-06, + "loss": 0.5624, + "step": 714 + }, + { + "epoch": 1.6993755575379126, + "grad_norm": 0.7407257556915283, + "learning_rate": 8.313912009512486e-06, + "loss": 0.4574, + "step": 715 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.7512819766998291, + "learning_rate": 8.3115338882283e-06, + "loss": 0.4704, + "step": 716 + }, + { + "epoch": 1.704133214391912, + "grad_norm": 0.7993274331092834, + "learning_rate": 8.309155766944115e-06, + "loss": 0.4471, + "step": 717 + }, + { + "epoch": 1.7065120428189116, + "grad_norm": 0.7007825970649719, + "learning_rate": 8.30677764565993e-06, + "loss": 0.5094, + "step": 718 + }, + { + "epoch": 1.7088908712459114, + "grad_norm": 0.723167359828949, + "learning_rate": 8.304399524375743e-06, + "loss": 0.4419, + "step": 719 + }, + { + "epoch": 1.711269699672911, + "grad_norm": 0.7736985683441162, + "learning_rate": 8.302021403091558e-06, + "loss": 0.5197, + "step": 720 + }, + { + "epoch": 1.7136485280999108, + "grad_norm": 0.6698117256164551, + "learning_rate": 8.299643281807372e-06, + "loss": 0.4535, + "step": 721 + }, + { + "epoch": 1.7160273565269106, + "grad_norm": 0.8475157022476196, + "learning_rate": 8.297265160523188e-06, + "loss": 0.5357, + "step": 722 + }, + { + "epoch": 1.7184061849539103, + "grad_norm": 0.7494747638702393, + "learning_rate": 8.294887039239003e-06, + "loss": 0.4198, + "step": 723 + }, + { + "epoch": 1.72078501338091, + "grad_norm": 0.7288322448730469, + "learning_rate": 8.292508917954817e-06, + "loss": 0.5022, + "step": 724 + }, + { + "epoch": 1.7231638418079096, + "grad_norm": 0.7320839762687683, + "learning_rate": 8.29013079667063e-06, + "loss": 0.5074, + "step": 725 + }, + { + "epoch": 1.7255426702349093, + "grad_norm": 0.6906112432479858, + "learning_rate": 8.287752675386446e-06, + "loss": 0.4287, + "step": 726 + }, + { + "epoch": 1.727921498661909, + "grad_norm": 0.717533528804779, + "learning_rate": 8.28537455410226e-06, + "loss": 0.4053, + "step": 727 + }, + { + "epoch": 1.7303003270889086, + "grad_norm": 0.757537841796875, + "learning_rate": 8.282996432818075e-06, + "loss": 0.4812, + "step": 728 + }, + { + "epoch": 1.7326791555159085, + "grad_norm": 0.7593153119087219, + "learning_rate": 8.280618311533889e-06, + "loss": 0.4908, + "step": 729 + }, + { + "epoch": 1.735057983942908, + "grad_norm": 0.7367984652519226, + "learning_rate": 8.278240190249703e-06, + "loss": 0.3914, + "step": 730 + }, + { + "epoch": 1.7374368123699078, + "grad_norm": 0.7257245182991028, + "learning_rate": 8.275862068965518e-06, + "loss": 0.4892, + "step": 731 + }, + { + "epoch": 1.7398156407969076, + "grad_norm": 0.6449756622314453, + "learning_rate": 8.273483947681332e-06, + "loss": 0.3835, + "step": 732 + }, + { + "epoch": 1.7421944692239073, + "grad_norm": 0.7567237615585327, + "learning_rate": 8.271105826397146e-06, + "loss": 0.4237, + "step": 733 + }, + { + "epoch": 1.744573297650907, + "grad_norm": 0.7595177292823792, + "learning_rate": 8.268727705112961e-06, + "loss": 0.5087, + "step": 734 + }, + { + "epoch": 1.7469521260779066, + "grad_norm": 0.7001402378082275, + "learning_rate": 8.266349583828775e-06, + "loss": 0.4222, + "step": 735 + }, + { + "epoch": 1.7493309545049063, + "grad_norm": 0.6895666718482971, + "learning_rate": 8.26397146254459e-06, + "loss": 0.432, + "step": 736 + }, + { + "epoch": 1.751709782931906, + "grad_norm": 0.7280940413475037, + "learning_rate": 8.261593341260406e-06, + "loss": 0.469, + "step": 737 + }, + { + "epoch": 1.7540886113589056, + "grad_norm": 0.7261313199996948, + "learning_rate": 8.25921521997622e-06, + "loss": 0.527, + "step": 738 + }, + { + "epoch": 1.7564674397859055, + "grad_norm": 0.7156343460083008, + "learning_rate": 8.256837098692034e-06, + "loss": 0.4058, + "step": 739 + }, + { + "epoch": 1.7588462682129051, + "grad_norm": 0.677166223526001, + "learning_rate": 8.254458977407849e-06, + "loss": 0.5109, + "step": 740 + }, + { + "epoch": 1.761225096639905, + "grad_norm": 0.7175779342651367, + "learning_rate": 8.252080856123663e-06, + "loss": 0.4518, + "step": 741 + }, + { + "epoch": 1.7636039250669047, + "grad_norm": 0.7992874979972839, + "learning_rate": 8.249702734839478e-06, + "loss": 0.4813, + "step": 742 + }, + { + "epoch": 1.7659827534939043, + "grad_norm": 0.7325806021690369, + "learning_rate": 8.247324613555292e-06, + "loss": 0.4308, + "step": 743 + }, + { + "epoch": 1.768361581920904, + "grad_norm": 0.7240017652511597, + "learning_rate": 8.244946492271106e-06, + "loss": 0.4553, + "step": 744 + }, + { + "epoch": 1.7707404103479036, + "grad_norm": 0.7299623489379883, + "learning_rate": 8.242568370986921e-06, + "loss": 0.5344, + "step": 745 + }, + { + "epoch": 1.7731192387749033, + "grad_norm": 0.6622987389564514, + "learning_rate": 8.240190249702735e-06, + "loss": 0.4311, + "step": 746 + }, + { + "epoch": 1.775498067201903, + "grad_norm": 0.7485376596450806, + "learning_rate": 8.23781212841855e-06, + "loss": 0.4835, + "step": 747 + }, + { + "epoch": 1.7778768956289026, + "grad_norm": 0.7301716208457947, + "learning_rate": 8.235434007134364e-06, + "loss": 0.4507, + "step": 748 + }, + { + "epoch": 1.7802557240559025, + "grad_norm": 0.7836865186691284, + "learning_rate": 8.233055885850178e-06, + "loss": 0.5503, + "step": 749 + }, + { + "epoch": 1.7826345524829021, + "grad_norm": 0.7640783786773682, + "learning_rate": 8.230677764565993e-06, + "loss": 0.4406, + "step": 750 + }, + { + "epoch": 1.7826345524829021, + "eval_loss": 0.47640714049339294, + "eval_runtime": 23.0318, + "eval_samples_per_second": 32.477, + "eval_steps_per_second": 16.238, + "step": 750 + }, + { + "epoch": 1.785013380909902, + "grad_norm": 0.6380470991134644, + "learning_rate": 8.228299643281809e-06, + "loss": 0.4483, + "step": 751 + }, + { + "epoch": 1.7873922093369017, + "grad_norm": 0.7657719850540161, + "learning_rate": 8.225921521997623e-06, + "loss": 0.5004, + "step": 752 + }, + { + "epoch": 1.7897710377639013, + "grad_norm": 0.6736970543861389, + "learning_rate": 8.223543400713438e-06, + "loss": 0.476, + "step": 753 + }, + { + "epoch": 1.792149866190901, + "grad_norm": 0.7384494543075562, + "learning_rate": 8.221165279429252e-06, + "loss": 0.461, + "step": 754 + }, + { + "epoch": 1.7945286946179007, + "grad_norm": 0.7509880065917969, + "learning_rate": 8.218787158145066e-06, + "loss": 0.4403, + "step": 755 + }, + { + "epoch": 1.7969075230449003, + "grad_norm": 0.7790508270263672, + "learning_rate": 8.216409036860881e-06, + "loss": 0.4821, + "step": 756 + }, + { + "epoch": 1.7992863514719, + "grad_norm": 0.7109124064445496, + "learning_rate": 8.214030915576695e-06, + "loss": 0.518, + "step": 757 + }, + { + "epoch": 1.8016651798988996, + "grad_norm": 0.72182297706604, + "learning_rate": 8.21165279429251e-06, + "loss": 0.4511, + "step": 758 + }, + { + "epoch": 1.8040440083258995, + "grad_norm": 0.6887755990028381, + "learning_rate": 8.209274673008324e-06, + "loss": 0.3566, + "step": 759 + }, + { + "epoch": 1.8064228367528992, + "grad_norm": 0.6632135510444641, + "learning_rate": 8.206896551724138e-06, + "loss": 0.3771, + "step": 760 + }, + { + "epoch": 1.808801665179899, + "grad_norm": 0.7614812850952148, + "learning_rate": 8.204518430439953e-06, + "loss": 0.4663, + "step": 761 + }, + { + "epoch": 1.8111804936068987, + "grad_norm": 0.7324971556663513, + "learning_rate": 8.202140309155767e-06, + "loss": 0.4138, + "step": 762 + }, + { + "epoch": 1.8135593220338984, + "grad_norm": 0.6464930772781372, + "learning_rate": 8.19976218787158e-06, + "loss": 0.4261, + "step": 763 + }, + { + "epoch": 1.815938150460898, + "grad_norm": 0.8067362308502197, + "learning_rate": 8.197384066587398e-06, + "loss": 0.4638, + "step": 764 + }, + { + "epoch": 1.8183169788878977, + "grad_norm": 0.8238150477409363, + "learning_rate": 8.195005945303212e-06, + "loss": 0.4514, + "step": 765 + }, + { + "epoch": 1.8206958073148973, + "grad_norm": 0.7046187520027161, + "learning_rate": 8.192627824019025e-06, + "loss": 0.4269, + "step": 766 + }, + { + "epoch": 1.823074635741897, + "grad_norm": 0.7702739238739014, + "learning_rate": 8.190249702734841e-06, + "loss": 0.4731, + "step": 767 + }, + { + "epoch": 1.8254534641688969, + "grad_norm": 0.75829017162323, + "learning_rate": 8.187871581450655e-06, + "loss": 0.4173, + "step": 768 + }, + { + "epoch": 1.8278322925958965, + "grad_norm": 0.7413159012794495, + "learning_rate": 8.185493460166468e-06, + "loss": 0.4981, + "step": 769 + }, + { + "epoch": 1.8302111210228962, + "grad_norm": 0.770672082901001, + "learning_rate": 8.183115338882284e-06, + "loss": 0.4018, + "step": 770 + }, + { + "epoch": 1.832589949449896, + "grad_norm": 0.7558537721633911, + "learning_rate": 8.180737217598098e-06, + "loss": 0.4571, + "step": 771 + }, + { + "epoch": 1.8349687778768957, + "grad_norm": 0.7780984044075012, + "learning_rate": 8.178359096313913e-06, + "loss": 0.4377, + "step": 772 + }, + { + "epoch": 1.8373476063038954, + "grad_norm": 0.8196839094161987, + "learning_rate": 8.175980975029727e-06, + "loss": 0.415, + "step": 773 + }, + { + "epoch": 1.839726434730895, + "grad_norm": 0.8161338567733765, + "learning_rate": 8.17360285374554e-06, + "loss": 0.4519, + "step": 774 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.6189297437667847, + "learning_rate": 8.171224732461356e-06, + "loss": 0.4033, + "step": 775 + }, + { + "epoch": 1.8444840915848943, + "grad_norm": 0.8346306681632996, + "learning_rate": 8.16884661117717e-06, + "loss": 0.5243, + "step": 776 + }, + { + "epoch": 1.846862920011894, + "grad_norm": 0.7419246435165405, + "learning_rate": 8.166468489892985e-06, + "loss": 0.43, + "step": 777 + }, + { + "epoch": 1.8492417484388939, + "grad_norm": 0.8460903167724609, + "learning_rate": 8.1640903686088e-06, + "loss": 0.571, + "step": 778 + }, + { + "epoch": 1.8516205768658935, + "grad_norm": 0.7926262617111206, + "learning_rate": 8.161712247324615e-06, + "loss": 0.5044, + "step": 779 + }, + { + "epoch": 1.8539994052928932, + "grad_norm": 0.7315496206283569, + "learning_rate": 8.159334126040428e-06, + "loss": 0.4564, + "step": 780 + }, + { + "epoch": 1.856378233719893, + "grad_norm": 0.8901942372322083, + "learning_rate": 8.156956004756244e-06, + "loss": 0.485, + "step": 781 + }, + { + "epoch": 1.8587570621468927, + "grad_norm": 0.7715483903884888, + "learning_rate": 8.154577883472058e-06, + "loss": 0.5152, + "step": 782 + }, + { + "epoch": 1.8611358905738924, + "grad_norm": 0.7323207855224609, + "learning_rate": 8.152199762187873e-06, + "loss": 0.3664, + "step": 783 + }, + { + "epoch": 1.863514719000892, + "grad_norm": 0.6117688417434692, + "learning_rate": 8.149821640903687e-06, + "loss": 0.2979, + "step": 784 + }, + { + "epoch": 1.8658935474278917, + "grad_norm": 0.7089108824729919, + "learning_rate": 8.1474435196195e-06, + "loss": 0.4314, + "step": 785 + }, + { + "epoch": 1.8682723758548914, + "grad_norm": 0.9341006278991699, + "learning_rate": 8.145065398335316e-06, + "loss": 0.5663, + "step": 786 + }, + { + "epoch": 1.870651204281891, + "grad_norm": 0.9197142124176025, + "learning_rate": 8.14268727705113e-06, + "loss": 0.4584, + "step": 787 + }, + { + "epoch": 1.873030032708891, + "grad_norm": 0.7007054090499878, + "learning_rate": 8.140309155766945e-06, + "loss": 0.4022, + "step": 788 + }, + { + "epoch": 1.8754088611358906, + "grad_norm": 0.7627575993537903, + "learning_rate": 8.137931034482759e-06, + "loss": 0.4276, + "step": 789 + }, + { + "epoch": 1.8777876895628904, + "grad_norm": 0.864920973777771, + "learning_rate": 8.135552913198573e-06, + "loss": 0.5177, + "step": 790 + }, + { + "epoch": 1.88016651798989, + "grad_norm": 0.6417273879051208, + "learning_rate": 8.133174791914388e-06, + "loss": 0.3893, + "step": 791 + }, + { + "epoch": 1.8825453464168898, + "grad_norm": 0.850860059261322, + "learning_rate": 8.130796670630204e-06, + "loss": 0.5604, + "step": 792 + }, + { + "epoch": 1.8849241748438894, + "grad_norm": 0.6767653822898865, + "learning_rate": 8.128418549346017e-06, + "loss": 0.3879, + "step": 793 + }, + { + "epoch": 1.887303003270889, + "grad_norm": 0.7524675130844116, + "learning_rate": 8.126040428061833e-06, + "loss": 0.424, + "step": 794 + }, + { + "epoch": 1.8896818316978887, + "grad_norm": 0.7639425992965698, + "learning_rate": 8.123662306777647e-06, + "loss": 0.4835, + "step": 795 + }, + { + "epoch": 1.8920606601248884, + "grad_norm": 0.8532370328903198, + "learning_rate": 8.12128418549346e-06, + "loss": 0.5434, + "step": 796 + }, + { + "epoch": 1.894439488551888, + "grad_norm": 0.7462890148162842, + "learning_rate": 8.118906064209276e-06, + "loss": 0.4373, + "step": 797 + }, + { + "epoch": 1.896818316978888, + "grad_norm": 0.7041010856628418, + "learning_rate": 8.11652794292509e-06, + "loss": 0.3591, + "step": 798 + }, + { + "epoch": 1.8991971454058876, + "grad_norm": 0.7861014008522034, + "learning_rate": 8.114149821640905e-06, + "loss": 0.4908, + "step": 799 + }, + { + "epoch": 1.9015759738328875, + "grad_norm": 0.7501910328865051, + "learning_rate": 8.111771700356719e-06, + "loss": 0.473, + "step": 800 + }, + { + "epoch": 1.9015759738328875, + "eval_loss": 0.47231346368789673, + "eval_runtime": 23.113, + "eval_samples_per_second": 32.363, + "eval_steps_per_second": 16.181, + "step": 800 + }, + { + "epoch": 1.9039548022598871, + "grad_norm": 0.7369624972343445, + "learning_rate": 8.109393579072533e-06, + "loss": 0.4122, + "step": 801 + }, + { + "epoch": 1.9063336306868868, + "grad_norm": 0.7466087937355042, + "learning_rate": 8.107015457788348e-06, + "loss": 0.4243, + "step": 802 + }, + { + "epoch": 1.9087124591138864, + "grad_norm": 0.701115608215332, + "learning_rate": 8.104637336504162e-06, + "loss": 0.4236, + "step": 803 + }, + { + "epoch": 1.911091287540886, + "grad_norm": 0.7000012397766113, + "learning_rate": 8.102259215219976e-06, + "loss": 0.4334, + "step": 804 + }, + { + "epoch": 1.9134701159678857, + "grad_norm": 0.7121322154998779, + "learning_rate": 8.099881093935793e-06, + "loss": 0.4162, + "step": 805 + }, + { + "epoch": 1.9158489443948854, + "grad_norm": 0.854978084564209, + "learning_rate": 8.097502972651607e-06, + "loss": 0.4894, + "step": 806 + }, + { + "epoch": 1.918227772821885, + "grad_norm": 0.806287944316864, + "learning_rate": 8.09512485136742e-06, + "loss": 0.4842, + "step": 807 + }, + { + "epoch": 1.920606601248885, + "grad_norm": 0.6621392369270325, + "learning_rate": 8.092746730083236e-06, + "loss": 0.4504, + "step": 808 + }, + { + "epoch": 1.9229854296758846, + "grad_norm": 0.7524036169052124, + "learning_rate": 8.09036860879905e-06, + "loss": 0.5002, + "step": 809 + }, + { + "epoch": 1.9253642581028845, + "grad_norm": 0.9552518129348755, + "learning_rate": 8.087990487514863e-06, + "loss": 0.4751, + "step": 810 + }, + { + "epoch": 1.9277430865298841, + "grad_norm": 0.684811532497406, + "learning_rate": 8.085612366230679e-06, + "loss": 0.4378, + "step": 811 + }, + { + "epoch": 1.9301219149568838, + "grad_norm": 0.7821682095527649, + "learning_rate": 8.083234244946492e-06, + "loss": 0.4023, + "step": 812 + }, + { + "epoch": 1.9325007433838834, + "grad_norm": 0.7470305562019348, + "learning_rate": 8.080856123662308e-06, + "loss": 0.4068, + "step": 813 + }, + { + "epoch": 1.934879571810883, + "grad_norm": 0.8974653482437134, + "learning_rate": 8.078478002378122e-06, + "loss": 0.5893, + "step": 814 + }, + { + "epoch": 1.9372584002378828, + "grad_norm": 0.8154368996620178, + "learning_rate": 8.076099881093935e-06, + "loss": 0.4178, + "step": 815 + }, + { + "epoch": 1.9396372286648824, + "grad_norm": 0.7278023362159729, + "learning_rate": 8.073721759809751e-06, + "loss": 0.4113, + "step": 816 + }, + { + "epoch": 1.9420160570918823, + "grad_norm": 0.7569306492805481, + "learning_rate": 8.071343638525565e-06, + "loss": 0.4363, + "step": 817 + }, + { + "epoch": 1.944394885518882, + "grad_norm": 0.8730507493019104, + "learning_rate": 8.06896551724138e-06, + "loss": 0.4375, + "step": 818 + }, + { + "epoch": 1.9467737139458816, + "grad_norm": 0.7905109524726868, + "learning_rate": 8.066587395957196e-06, + "loss": 0.468, + "step": 819 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 0.7219825387001038, + "learning_rate": 8.06420927467301e-06, + "loss": 0.4806, + "step": 820 + }, + { + "epoch": 1.9515313707998811, + "grad_norm": 0.8626275062561035, + "learning_rate": 8.061831153388823e-06, + "loss": 0.4454, + "step": 821 + }, + { + "epoch": 1.9539101992268808, + "grad_norm": 0.7823079228401184, + "learning_rate": 8.059453032104639e-06, + "loss": 0.4504, + "step": 822 + }, + { + "epoch": 1.9562890276538805, + "grad_norm": 0.7249975800514221, + "learning_rate": 8.057074910820452e-06, + "loss": 0.4597, + "step": 823 + }, + { + "epoch": 1.9586678560808801, + "grad_norm": 0.8110474944114685, + "learning_rate": 8.054696789536268e-06, + "loss": 0.4758, + "step": 824 + }, + { + "epoch": 1.9610466845078798, + "grad_norm": 0.8278000354766846, + "learning_rate": 8.052318668252082e-06, + "loss": 0.5194, + "step": 825 + }, + { + "epoch": 1.9634255129348794, + "grad_norm": 0.8359079360961914, + "learning_rate": 8.049940546967895e-06, + "loss": 0.5727, + "step": 826 + }, + { + "epoch": 1.9658043413618793, + "grad_norm": 0.6099646091461182, + "learning_rate": 8.04756242568371e-06, + "loss": 0.3531, + "step": 827 + }, + { + "epoch": 1.968183169788879, + "grad_norm": 0.843914270401001, + "learning_rate": 8.045184304399525e-06, + "loss": 0.4703, + "step": 828 + }, + { + "epoch": 1.9705619982158786, + "grad_norm": 0.7643455266952515, + "learning_rate": 8.04280618311534e-06, + "loss": 0.4282, + "step": 829 + }, + { + "epoch": 1.9729408266428785, + "grad_norm": 0.7639408707618713, + "learning_rate": 8.040428061831154e-06, + "loss": 0.4229, + "step": 830 + }, + { + "epoch": 1.9753196550698782, + "grad_norm": 0.6471571922302246, + "learning_rate": 8.038049940546968e-06, + "loss": 0.4413, + "step": 831 + }, + { + "epoch": 1.9776984834968778, + "grad_norm": 0.8547807335853577, + "learning_rate": 8.035671819262783e-06, + "loss": 0.5527, + "step": 832 + }, + { + "epoch": 1.9800773119238775, + "grad_norm": 0.7524789571762085, + "learning_rate": 8.033293697978598e-06, + "loss": 0.4913, + "step": 833 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 0.763403058052063, + "learning_rate": 8.030915576694412e-06, + "loss": 0.4297, + "step": 834 + }, + { + "epoch": 1.9848349687778768, + "grad_norm": 0.8133661150932312, + "learning_rate": 8.028537455410228e-06, + "loss": 0.4736, + "step": 835 + }, + { + "epoch": 1.9872137972048765, + "grad_norm": 0.6344509124755859, + "learning_rate": 8.026159334126041e-06, + "loss": 0.3952, + "step": 836 + }, + { + "epoch": 1.9895926256318763, + "grad_norm": 0.7125160694122314, + "learning_rate": 8.023781212841855e-06, + "loss": 0.4295, + "step": 837 + }, + { + "epoch": 1.991971454058876, + "grad_norm": 0.7310596108436584, + "learning_rate": 8.02140309155767e-06, + "loss": 0.4009, + "step": 838 + }, + { + "epoch": 1.9943502824858759, + "grad_norm": 0.7372251152992249, + "learning_rate": 8.019024970273484e-06, + "loss": 0.4511, + "step": 839 + }, + { + "epoch": 1.9967291109128755, + "grad_norm": 0.6241785287857056, + "learning_rate": 8.016646848989298e-06, + "loss": 0.4283, + "step": 840 + }, + { + "epoch": 1.9991079393398752, + "grad_norm": 0.7285897731781006, + "learning_rate": 8.014268727705114e-06, + "loss": 0.4959, + "step": 841 + }, + { + "epoch": 2.0, + "grad_norm": 1.345123291015625, + "learning_rate": 8.011890606420927e-06, + "loss": 0.4455, + "step": 842 + }, + { + "epoch": 2.0023788284269997, + "grad_norm": 0.8226174116134644, + "learning_rate": 8.009512485136743e-06, + "loss": 0.4564, + "step": 843 + }, + { + "epoch": 2.0047576568539993, + "grad_norm": 0.6699106097221375, + "learning_rate": 8.007134363852557e-06, + "loss": 0.4863, + "step": 844 + }, + { + "epoch": 2.007136485280999, + "grad_norm": 0.6968250870704651, + "learning_rate": 8.00475624256837e-06, + "loss": 0.4396, + "step": 845 + }, + { + "epoch": 2.0095153137079986, + "grad_norm": 0.8628593683242798, + "learning_rate": 8.002378121284186e-06, + "loss": 0.4797, + "step": 846 + }, + { + "epoch": 2.0118941421349987, + "grad_norm": 0.8205950856208801, + "learning_rate": 8.000000000000001e-06, + "loss": 0.516, + "step": 847 + }, + { + "epoch": 2.0142729705619984, + "grad_norm": 0.8094960451126099, + "learning_rate": 7.997621878715815e-06, + "loss": 0.4018, + "step": 848 + }, + { + "epoch": 2.016651798988998, + "grad_norm": 0.7943852543830872, + "learning_rate": 7.99524375743163e-06, + "loss": 0.461, + "step": 849 + }, + { + "epoch": 2.0190306274159977, + "grad_norm": 0.7668120265007019, + "learning_rate": 7.992865636147444e-06, + "loss": 0.4019, + "step": 850 + }, + { + "epoch": 2.0190306274159977, + "eval_loss": 0.46766459941864014, + "eval_runtime": 23.8335, + "eval_samples_per_second": 31.384, + "eval_steps_per_second": 15.692, + "step": 850 + }, + { + "epoch": 2.0214094558429974, + "grad_norm": 0.7893855571746826, + "learning_rate": 7.990487514863258e-06, + "loss": 0.4385, + "step": 851 + }, + { + "epoch": 2.023788284269997, + "grad_norm": 0.7579034566879272, + "learning_rate": 7.988109393579074e-06, + "loss": 0.3582, + "step": 852 + }, + { + "epoch": 2.0261671126969967, + "grad_norm": 0.8687305450439453, + "learning_rate": 7.985731272294887e-06, + "loss": 0.4546, + "step": 853 + }, + { + "epoch": 2.0285459411239963, + "grad_norm": 0.8024737238883972, + "learning_rate": 7.983353151010703e-06, + "loss": 0.4491, + "step": 854 + }, + { + "epoch": 2.030924769550996, + "grad_norm": 0.8213666677474976, + "learning_rate": 7.980975029726517e-06, + "loss": 0.4727, + "step": 855 + }, + { + "epoch": 2.0333035979779956, + "grad_norm": 0.7788676619529724, + "learning_rate": 7.97859690844233e-06, + "loss": 0.5308, + "step": 856 + }, + { + "epoch": 2.0356824264049957, + "grad_norm": 0.8892275094985962, + "learning_rate": 7.976218787158146e-06, + "loss": 0.4728, + "step": 857 + }, + { + "epoch": 2.0380612548319954, + "grad_norm": 0.810043215751648, + "learning_rate": 7.97384066587396e-06, + "loss": 0.4525, + "step": 858 + }, + { + "epoch": 2.040440083258995, + "grad_norm": 0.8676638007164001, + "learning_rate": 7.971462544589775e-06, + "loss": 0.453, + "step": 859 + }, + { + "epoch": 2.0428189116859947, + "grad_norm": 0.840715229511261, + "learning_rate": 7.96908442330559e-06, + "loss": 0.491, + "step": 860 + }, + { + "epoch": 2.0451977401129944, + "grad_norm": 0.7979345321655273, + "learning_rate": 7.966706302021404e-06, + "loss": 0.5168, + "step": 861 + }, + { + "epoch": 2.047576568539994, + "grad_norm": 0.7907995581626892, + "learning_rate": 7.964328180737218e-06, + "loss": 0.3946, + "step": 862 + }, + { + "epoch": 2.0499553969669937, + "grad_norm": 0.7351325154304504, + "learning_rate": 7.961950059453033e-06, + "loss": 0.3901, + "step": 863 + }, + { + "epoch": 2.0523342253939933, + "grad_norm": 0.7891148924827576, + "learning_rate": 7.959571938168847e-06, + "loss": 0.4215, + "step": 864 + }, + { + "epoch": 2.054713053820993, + "grad_norm": 0.7307685613632202, + "learning_rate": 7.957193816884663e-06, + "loss": 0.4083, + "step": 865 + }, + { + "epoch": 2.0570918822479927, + "grad_norm": 0.7875308990478516, + "learning_rate": 7.954815695600476e-06, + "loss": 0.4587, + "step": 866 + }, + { + "epoch": 2.0594707106749928, + "grad_norm": 0.682457447052002, + "learning_rate": 7.95243757431629e-06, + "loss": 0.4193, + "step": 867 + }, + { + "epoch": 2.0618495391019924, + "grad_norm": 0.7338114976882935, + "learning_rate": 7.950059453032106e-06, + "loss": 0.4129, + "step": 868 + }, + { + "epoch": 2.064228367528992, + "grad_norm": 0.7637101411819458, + "learning_rate": 7.94768133174792e-06, + "loss": 0.4083, + "step": 869 + }, + { + "epoch": 2.0666071959559917, + "grad_norm": 0.7745689749717712, + "learning_rate": 7.945303210463733e-06, + "loss": 0.4502, + "step": 870 + }, + { + "epoch": 2.0689860243829914, + "grad_norm": 0.7925781011581421, + "learning_rate": 7.942925089179549e-06, + "loss": 0.4061, + "step": 871 + }, + { + "epoch": 2.071364852809991, + "grad_norm": 0.6674710512161255, + "learning_rate": 7.940546967895362e-06, + "loss": 0.4038, + "step": 872 + }, + { + "epoch": 2.0737436812369907, + "grad_norm": 0.7587394714355469, + "learning_rate": 7.938168846611178e-06, + "loss": 0.4197, + "step": 873 + }, + { + "epoch": 2.0761225096639904, + "grad_norm": 0.6567443609237671, + "learning_rate": 7.935790725326993e-06, + "loss": 0.392, + "step": 874 + }, + { + "epoch": 2.07850133809099, + "grad_norm": 0.7451093196868896, + "learning_rate": 7.933412604042807e-06, + "loss": 0.5078, + "step": 875 + }, + { + "epoch": 2.0808801665179897, + "grad_norm": 0.8137801289558411, + "learning_rate": 7.93103448275862e-06, + "loss": 0.4863, + "step": 876 + }, + { + "epoch": 2.08325899494499, + "grad_norm": 0.7367770075798035, + "learning_rate": 7.928656361474436e-06, + "loss": 0.4536, + "step": 877 + }, + { + "epoch": 2.0856378233719894, + "grad_norm": 0.7012405395507812, + "learning_rate": 7.92627824019025e-06, + "loss": 0.3474, + "step": 878 + }, + { + "epoch": 2.088016651798989, + "grad_norm": 0.83682781457901, + "learning_rate": 7.923900118906065e-06, + "loss": 0.4634, + "step": 879 + }, + { + "epoch": 2.0903954802259888, + "grad_norm": 0.6622467637062073, + "learning_rate": 7.92152199762188e-06, + "loss": 0.4125, + "step": 880 + }, + { + "epoch": 2.0927743086529884, + "grad_norm": 0.7863491177558899, + "learning_rate": 7.919143876337693e-06, + "loss": 0.4203, + "step": 881 + }, + { + "epoch": 2.095153137079988, + "grad_norm": 0.8468984961509705, + "learning_rate": 7.916765755053508e-06, + "loss": 0.4445, + "step": 882 + }, + { + "epoch": 2.0975319655069877, + "grad_norm": 0.7020966410636902, + "learning_rate": 7.914387633769322e-06, + "loss": 0.4288, + "step": 883 + }, + { + "epoch": 2.0999107939339874, + "grad_norm": 0.7692680954933167, + "learning_rate": 7.912009512485138e-06, + "loss": 0.444, + "step": 884 + }, + { + "epoch": 2.102289622360987, + "grad_norm": 0.7451436519622803, + "learning_rate": 7.909631391200951e-06, + "loss": 0.4211, + "step": 885 + }, + { + "epoch": 2.1046684507879867, + "grad_norm": 0.821258008480072, + "learning_rate": 7.907253269916765e-06, + "loss": 0.4756, + "step": 886 + }, + { + "epoch": 2.107047279214987, + "grad_norm": 1.0395094156265259, + "learning_rate": 7.90487514863258e-06, + "loss": 0.5017, + "step": 887 + }, + { + "epoch": 2.1094261076419865, + "grad_norm": 0.883202850818634, + "learning_rate": 7.902497027348396e-06, + "loss": 0.4526, + "step": 888 + }, + { + "epoch": 2.111804936068986, + "grad_norm": 0.896237850189209, + "learning_rate": 7.90011890606421e-06, + "loss": 0.4437, + "step": 889 + }, + { + "epoch": 2.1141837644959858, + "grad_norm": 0.7526819109916687, + "learning_rate": 7.897740784780025e-06, + "loss": 0.5119, + "step": 890 + }, + { + "epoch": 2.1165625929229854, + "grad_norm": 0.7432401180267334, + "learning_rate": 7.895362663495839e-06, + "loss": 0.4054, + "step": 891 + }, + { + "epoch": 2.118941421349985, + "grad_norm": 0.7850770950317383, + "learning_rate": 7.892984542211653e-06, + "loss": 0.4176, + "step": 892 + }, + { + "epoch": 2.1213202497769847, + "grad_norm": 0.833646833896637, + "learning_rate": 7.890606420927468e-06, + "loss": 0.515, + "step": 893 + }, + { + "epoch": 2.1236990782039844, + "grad_norm": 0.8409990668296814, + "learning_rate": 7.888228299643282e-06, + "loss": 0.4481, + "step": 894 + }, + { + "epoch": 2.126077906630984, + "grad_norm": 0.8069798946380615, + "learning_rate": 7.885850178359098e-06, + "loss": 0.524, + "step": 895 + }, + { + "epoch": 2.128456735057984, + "grad_norm": 0.7600740790367126, + "learning_rate": 7.883472057074911e-06, + "loss": 0.3937, + "step": 896 + }, + { + "epoch": 2.130835563484984, + "grad_norm": 0.9579213261604309, + "learning_rate": 7.881093935790725e-06, + "loss": 0.5705, + "step": 897 + }, + { + "epoch": 2.1332143919119835, + "grad_norm": 0.6670339107513428, + "learning_rate": 7.87871581450654e-06, + "loss": 0.4213, + "step": 898 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 0.7561557292938232, + "learning_rate": 7.876337693222354e-06, + "loss": 0.3978, + "step": 899 + }, + { + "epoch": 2.137972048765983, + "grad_norm": 0.740649402141571, + "learning_rate": 7.87395957193817e-06, + "loss": 0.4085, + "step": 900 + }, + { + "epoch": 2.137972048765983, + "eval_loss": 0.4641876220703125, + "eval_runtime": 24.0325, + "eval_samples_per_second": 31.125, + "eval_steps_per_second": 15.562, + "step": 900 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.7746867537498474, + "learning_rate": 7.871581450653985e-06, + "loss": 0.3508, + "step": 901 + }, + { + "epoch": 2.142729705619982, + "grad_norm": 0.8443162441253662, + "learning_rate": 7.869203329369799e-06, + "loss": 0.4803, + "step": 902 + }, + { + "epoch": 2.1451085340469818, + "grad_norm": 0.8279404640197754, + "learning_rate": 7.866825208085613e-06, + "loss": 0.4427, + "step": 903 + }, + { + "epoch": 2.1474873624739814, + "grad_norm": 0.7320528626441956, + "learning_rate": 7.864447086801428e-06, + "loss": 0.3763, + "step": 904 + }, + { + "epoch": 2.149866190900981, + "grad_norm": 0.7278544306755066, + "learning_rate": 7.862068965517242e-06, + "loss": 0.3739, + "step": 905 + }, + { + "epoch": 2.1522450193279807, + "grad_norm": 0.9622790217399597, + "learning_rate": 7.859690844233057e-06, + "loss": 0.404, + "step": 906 + }, + { + "epoch": 2.154623847754981, + "grad_norm": 0.8544254899024963, + "learning_rate": 7.857312722948871e-06, + "loss": 0.4139, + "step": 907 + }, + { + "epoch": 2.1570026761819805, + "grad_norm": 0.8915610909461975, + "learning_rate": 7.854934601664685e-06, + "loss": 0.4189, + "step": 908 + }, + { + "epoch": 2.15938150460898, + "grad_norm": 0.7949631214141846, + "learning_rate": 7.8525564803805e-06, + "loss": 0.4642, + "step": 909 + }, + { + "epoch": 2.16176033303598, + "grad_norm": 0.7947200536727905, + "learning_rate": 7.850178359096314e-06, + "loss": 0.433, + "step": 910 + }, + { + "epoch": 2.1641391614629795, + "grad_norm": 0.7428370714187622, + "learning_rate": 7.847800237812128e-06, + "loss": 0.3817, + "step": 911 + }, + { + "epoch": 2.166517989889979, + "grad_norm": 0.8625103235244751, + "learning_rate": 7.845422116527943e-06, + "loss": 0.4264, + "step": 912 + }, + { + "epoch": 2.168896818316979, + "grad_norm": 0.8226600289344788, + "learning_rate": 7.843043995243757e-06, + "loss": 0.4144, + "step": 913 + }, + { + "epoch": 2.1712756467439784, + "grad_norm": 0.8117079734802246, + "learning_rate": 7.840665873959573e-06, + "loss": 0.4888, + "step": 914 + }, + { + "epoch": 2.173654475170978, + "grad_norm": 0.8147917985916138, + "learning_rate": 7.838287752675388e-06, + "loss": 0.4256, + "step": 915 + }, + { + "epoch": 2.176033303597978, + "grad_norm": 0.9285522699356079, + "learning_rate": 7.835909631391202e-06, + "loss": 0.4483, + "step": 916 + }, + { + "epoch": 2.178412132024978, + "grad_norm": 0.8799831867218018, + "learning_rate": 7.833531510107016e-06, + "loss": 0.4337, + "step": 917 + }, + { + "epoch": 2.1807909604519775, + "grad_norm": 0.8417677283287048, + "learning_rate": 7.831153388822831e-06, + "loss": 0.4777, + "step": 918 + }, + { + "epoch": 2.183169788878977, + "grad_norm": 0.8338337540626526, + "learning_rate": 7.828775267538645e-06, + "loss": 0.4188, + "step": 919 + }, + { + "epoch": 2.185548617305977, + "grad_norm": 0.9104913473129272, + "learning_rate": 7.82639714625446e-06, + "loss": 0.4338, + "step": 920 + }, + { + "epoch": 2.1879274457329765, + "grad_norm": 0.9980224370956421, + "learning_rate": 7.824019024970274e-06, + "loss": 0.5415, + "step": 921 + }, + { + "epoch": 2.190306274159976, + "grad_norm": 0.7067880034446716, + "learning_rate": 7.821640903686088e-06, + "loss": 0.3728, + "step": 922 + }, + { + "epoch": 2.192685102586976, + "grad_norm": 0.8863831758499146, + "learning_rate": 7.819262782401903e-06, + "loss": 0.4424, + "step": 923 + }, + { + "epoch": 2.1950639310139755, + "grad_norm": 0.8814319372177124, + "learning_rate": 7.816884661117717e-06, + "loss": 0.5314, + "step": 924 + }, + { + "epoch": 2.197442759440975, + "grad_norm": 0.8519752025604248, + "learning_rate": 7.814506539833533e-06, + "loss": 0.4895, + "step": 925 + }, + { + "epoch": 2.199821587867975, + "grad_norm": 0.9418483376502991, + "learning_rate": 7.812128418549346e-06, + "loss": 0.4641, + "step": 926 + }, + { + "epoch": 2.202200416294975, + "grad_norm": 0.832245945930481, + "learning_rate": 7.80975029726516e-06, + "loss": 0.4501, + "step": 927 + }, + { + "epoch": 2.2045792447219745, + "grad_norm": 0.7454311847686768, + "learning_rate": 7.807372175980975e-06, + "loss": 0.4002, + "step": 928 + }, + { + "epoch": 2.206958073148974, + "grad_norm": 0.768144965171814, + "learning_rate": 7.804994054696791e-06, + "loss": 0.4493, + "step": 929 + }, + { + "epoch": 2.209336901575974, + "grad_norm": 0.8892447352409363, + "learning_rate": 7.802615933412605e-06, + "loss": 0.4512, + "step": 930 + }, + { + "epoch": 2.2117157300029735, + "grad_norm": 0.791732132434845, + "learning_rate": 7.80023781212842e-06, + "loss": 0.4198, + "step": 931 + }, + { + "epoch": 2.214094558429973, + "grad_norm": 1.0215604305267334, + "learning_rate": 7.797859690844234e-06, + "loss": 0.4984, + "step": 932 + }, + { + "epoch": 2.216473386856973, + "grad_norm": 0.8127293586730957, + "learning_rate": 7.795481569560048e-06, + "loss": 0.4232, + "step": 933 + }, + { + "epoch": 2.2188522152839725, + "grad_norm": 1.0035741329193115, + "learning_rate": 7.793103448275863e-06, + "loss": 0.5349, + "step": 934 + }, + { + "epoch": 2.221231043710972, + "grad_norm": 0.9261972904205322, + "learning_rate": 7.790725326991677e-06, + "loss": 0.3923, + "step": 935 + }, + { + "epoch": 2.2236098721379722, + "grad_norm": 0.7336574792861938, + "learning_rate": 7.788347205707492e-06, + "loss": 0.4099, + "step": 936 + }, + { + "epoch": 2.225988700564972, + "grad_norm": 0.8044326305389404, + "learning_rate": 7.785969084423306e-06, + "loss": 0.4419, + "step": 937 + }, + { + "epoch": 2.2283675289919715, + "grad_norm": 0.7921407222747803, + "learning_rate": 7.78359096313912e-06, + "loss": 0.4732, + "step": 938 + }, + { + "epoch": 2.230746357418971, + "grad_norm": 0.8141794204711914, + "learning_rate": 7.781212841854935e-06, + "loss": 0.4213, + "step": 939 + }, + { + "epoch": 2.233125185845971, + "grad_norm": 0.8599123954772949, + "learning_rate": 7.778834720570749e-06, + "loss": 0.4641, + "step": 940 + }, + { + "epoch": 2.2355040142729705, + "grad_norm": 0.7938510775566101, + "learning_rate": 7.776456599286563e-06, + "loss": 0.4324, + "step": 941 + }, + { + "epoch": 2.23788284269997, + "grad_norm": 0.9228736162185669, + "learning_rate": 7.77407847800238e-06, + "loss": 0.548, + "step": 942 + }, + { + "epoch": 2.24026167112697, + "grad_norm": 0.8649184703826904, + "learning_rate": 7.771700356718194e-06, + "loss": 0.4353, + "step": 943 + }, + { + "epoch": 2.2426404995539695, + "grad_norm": 0.7102033495903015, + "learning_rate": 7.769322235434008e-06, + "loss": 0.4092, + "step": 944 + }, + { + "epoch": 2.2450193279809696, + "grad_norm": 0.8156210780143738, + "learning_rate": 7.766944114149823e-06, + "loss": 0.4562, + "step": 945 + }, + { + "epoch": 2.2473981564079692, + "grad_norm": 0.8445010781288147, + "learning_rate": 7.764565992865637e-06, + "loss": 0.4943, + "step": 946 + }, + { + "epoch": 2.249776984834969, + "grad_norm": 0.8647717237472534, + "learning_rate": 7.76218787158145e-06, + "loss": 0.4397, + "step": 947 + }, + { + "epoch": 2.2521558132619686, + "grad_norm": 0.8202661275863647, + "learning_rate": 7.759809750297266e-06, + "loss": 0.4012, + "step": 948 + }, + { + "epoch": 2.254534641688968, + "grad_norm": 0.7931515574455261, + "learning_rate": 7.75743162901308e-06, + "loss": 0.3904, + "step": 949 + }, + { + "epoch": 2.256913470115968, + "grad_norm": 0.9051262140274048, + "learning_rate": 7.755053507728895e-06, + "loss": 0.3948, + "step": 950 + }, + { + "epoch": 2.256913470115968, + "eval_loss": 0.46028128266334534, + "eval_runtime": 24.1017, + "eval_samples_per_second": 31.035, + "eval_steps_per_second": 15.518, + "step": 950 + }, + { + "epoch": 2.2592922985429675, + "grad_norm": 0.9169323444366455, + "learning_rate": 7.752675386444709e-06, + "loss": 0.4887, + "step": 951 + }, + { + "epoch": 2.261671126969967, + "grad_norm": 0.7803930044174194, + "learning_rate": 7.750297265160523e-06, + "loss": 0.4698, + "step": 952 + }, + { + "epoch": 2.264049955396967, + "grad_norm": 0.9058618545532227, + "learning_rate": 7.747919143876338e-06, + "loss": 0.4713, + "step": 953 + }, + { + "epoch": 2.2664287838239665, + "grad_norm": 0.8882755041122437, + "learning_rate": 7.745541022592152e-06, + "loss": 0.4533, + "step": 954 + }, + { + "epoch": 2.268807612250966, + "grad_norm": 0.8060216903686523, + "learning_rate": 7.743162901307967e-06, + "loss": 0.4572, + "step": 955 + }, + { + "epoch": 2.2711864406779663, + "grad_norm": 0.8613508343696594, + "learning_rate": 7.740784780023783e-06, + "loss": 0.4963, + "step": 956 + }, + { + "epoch": 2.273565269104966, + "grad_norm": 0.9018996953964233, + "learning_rate": 7.738406658739597e-06, + "loss": 0.4734, + "step": 957 + }, + { + "epoch": 2.2759440975319656, + "grad_norm": 0.9956986904144287, + "learning_rate": 7.73602853745541e-06, + "loss": 0.3971, + "step": 958 + }, + { + "epoch": 2.2783229259589652, + "grad_norm": 0.8715051412582397, + "learning_rate": 7.733650416171226e-06, + "loss": 0.4225, + "step": 959 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.8763250708580017, + "learning_rate": 7.73127229488704e-06, + "loss": 0.4715, + "step": 960 + }, + { + "epoch": 2.2830805828129646, + "grad_norm": 0.7025564908981323, + "learning_rate": 7.728894173602855e-06, + "loss": 0.4111, + "step": 961 + }, + { + "epoch": 2.285459411239964, + "grad_norm": 0.8282013535499573, + "learning_rate": 7.726516052318669e-06, + "loss": 0.4688, + "step": 962 + }, + { + "epoch": 2.287838239666964, + "grad_norm": 0.8271874189376831, + "learning_rate": 7.724137931034483e-06, + "loss": 0.422, + "step": 963 + }, + { + "epoch": 2.2902170680939635, + "grad_norm": 0.7471371293067932, + "learning_rate": 7.721759809750298e-06, + "loss": 0.3903, + "step": 964 + }, + { + "epoch": 2.2925958965209636, + "grad_norm": 0.9121617674827576, + "learning_rate": 7.719381688466112e-06, + "loss": 0.4376, + "step": 965 + }, + { + "epoch": 2.2949747249479633, + "grad_norm": 0.8503472208976746, + "learning_rate": 7.717003567181927e-06, + "loss": 0.4593, + "step": 966 + }, + { + "epoch": 2.297353553374963, + "grad_norm": 0.9025017023086548, + "learning_rate": 7.714625445897741e-06, + "loss": 0.5016, + "step": 967 + }, + { + "epoch": 2.2997323818019626, + "grad_norm": 0.8059864044189453, + "learning_rate": 7.712247324613555e-06, + "loss": 0.3848, + "step": 968 + }, + { + "epoch": 2.3021112102289623, + "grad_norm": 0.7527774572372437, + "learning_rate": 7.70986920332937e-06, + "loss": 0.4178, + "step": 969 + }, + { + "epoch": 2.304490038655962, + "grad_norm": 0.7226907014846802, + "learning_rate": 7.707491082045186e-06, + "loss": 0.4458, + "step": 970 + }, + { + "epoch": 2.3068688670829616, + "grad_norm": 0.7715461254119873, + "learning_rate": 7.705112960761e-06, + "loss": 0.4129, + "step": 971 + }, + { + "epoch": 2.3092476955099612, + "grad_norm": 0.7834897637367249, + "learning_rate": 7.702734839476815e-06, + "loss": 0.4177, + "step": 972 + }, + { + "epoch": 2.311626523936961, + "grad_norm": 0.785012423992157, + "learning_rate": 7.700356718192629e-06, + "loss": 0.4261, + "step": 973 + }, + { + "epoch": 2.314005352363961, + "grad_norm": 0.8159284591674805, + "learning_rate": 7.697978596908443e-06, + "loss": 0.3899, + "step": 974 + }, + { + "epoch": 2.3163841807909606, + "grad_norm": 0.7466394901275635, + "learning_rate": 7.695600475624258e-06, + "loss": 0.3574, + "step": 975 + }, + { + "epoch": 2.3187630092179603, + "grad_norm": 0.8210408687591553, + "learning_rate": 7.693222354340072e-06, + "loss": 0.4353, + "step": 976 + }, + { + "epoch": 2.32114183764496, + "grad_norm": 0.8266494274139404, + "learning_rate": 7.690844233055886e-06, + "loss": 0.4175, + "step": 977 + }, + { + "epoch": 2.3235206660719596, + "grad_norm": 0.7417374849319458, + "learning_rate": 7.688466111771701e-06, + "loss": 0.3957, + "step": 978 + }, + { + "epoch": 2.3258994944989593, + "grad_norm": 0.7381911277770996, + "learning_rate": 7.686087990487515e-06, + "loss": 0.4048, + "step": 979 + }, + { + "epoch": 2.328278322925959, + "grad_norm": 0.8789836168289185, + "learning_rate": 7.68370986920333e-06, + "loss": 0.4262, + "step": 980 + }, + { + "epoch": 2.3306571513529586, + "grad_norm": 0.7795222997665405, + "learning_rate": 7.681331747919144e-06, + "loss": 0.4214, + "step": 981 + }, + { + "epoch": 2.3330359797799582, + "grad_norm": 0.9465823173522949, + "learning_rate": 7.678953626634958e-06, + "loss": 0.466, + "step": 982 + }, + { + "epoch": 2.335414808206958, + "grad_norm": 0.7730276584625244, + "learning_rate": 7.676575505350773e-06, + "loss": 0.3447, + "step": 983 + }, + { + "epoch": 2.3377936366339576, + "grad_norm": 0.8702282309532166, + "learning_rate": 7.674197384066589e-06, + "loss": 0.4767, + "step": 984 + }, + { + "epoch": 2.3401724650609577, + "grad_norm": 0.8919823169708252, + "learning_rate": 7.671819262782402e-06, + "loss": 0.4013, + "step": 985 + }, + { + "epoch": 2.3425512934879573, + "grad_norm": 0.8225651383399963, + "learning_rate": 7.669441141498218e-06, + "loss": 0.451, + "step": 986 + }, + { + "epoch": 2.344930121914957, + "grad_norm": 0.7931697368621826, + "learning_rate": 7.667063020214032e-06, + "loss": 0.3694, + "step": 987 + }, + { + "epoch": 2.3473089503419566, + "grad_norm": 0.9419750571250916, + "learning_rate": 7.664684898929845e-06, + "loss": 0.4445, + "step": 988 + }, + { + "epoch": 2.3496877787689563, + "grad_norm": 0.7529578804969788, + "learning_rate": 7.66230677764566e-06, + "loss": 0.4299, + "step": 989 + }, + { + "epoch": 2.352066607195956, + "grad_norm": 0.8571377396583557, + "learning_rate": 7.659928656361475e-06, + "loss": 0.3893, + "step": 990 + }, + { + "epoch": 2.3544454356229556, + "grad_norm": 0.8232892155647278, + "learning_rate": 7.65755053507729e-06, + "loss": 0.4386, + "step": 991 + }, + { + "epoch": 2.3568242640499553, + "grad_norm": 0.777484118938446, + "learning_rate": 7.655172413793104e-06, + "loss": 0.369, + "step": 992 + }, + { + "epoch": 2.359203092476955, + "grad_norm": 0.8271664381027222, + "learning_rate": 7.652794292508918e-06, + "loss": 0.4186, + "step": 993 + }, + { + "epoch": 2.361581920903955, + "grad_norm": 0.8740116953849792, + "learning_rate": 7.650416171224733e-06, + "loss": 0.4665, + "step": 994 + }, + { + "epoch": 2.3639607493309547, + "grad_norm": 0.7496222257614136, + "learning_rate": 7.648038049940547e-06, + "loss": 0.3899, + "step": 995 + }, + { + "epoch": 2.3663395777579543, + "grad_norm": 0.8872946500778198, + "learning_rate": 7.645659928656362e-06, + "loss": 0.478, + "step": 996 + }, + { + "epoch": 2.368718406184954, + "grad_norm": 0.7702075839042664, + "learning_rate": 7.643281807372178e-06, + "loss": 0.4467, + "step": 997 + }, + { + "epoch": 2.3710972346119537, + "grad_norm": 0.8498168587684631, + "learning_rate": 7.640903686087991e-06, + "loss": 0.4198, + "step": 998 + }, + { + "epoch": 2.3734760630389533, + "grad_norm": 0.9242385625839233, + "learning_rate": 7.638525564803805e-06, + "loss": 0.4923, + "step": 999 + }, + { + "epoch": 2.375854891465953, + "grad_norm": 0.8855516314506531, + "learning_rate": 7.63614744351962e-06, + "loss": 0.4374, + "step": 1000 + }, + { + "epoch": 2.375854891465953, + "eval_loss": 0.4574245512485504, + "eval_runtime": 23.2912, + "eval_samples_per_second": 32.115, + "eval_steps_per_second": 16.058, + "step": 1000 + }, + { + "epoch": 2.3782337198929526, + "grad_norm": 0.8660386204719543, + "learning_rate": 7.633769322235434e-06, + "loss": 0.4629, + "step": 1001 + }, + { + "epoch": 2.3806125483199523, + "grad_norm": 0.8469867706298828, + "learning_rate": 7.63139120095125e-06, + "loss": 0.4416, + "step": 1002 + }, + { + "epoch": 2.382991376746952, + "grad_norm": 0.8332631587982178, + "learning_rate": 7.629013079667064e-06, + "loss": 0.4942, + "step": 1003 + }, + { + "epoch": 2.3853702051739516, + "grad_norm": 0.8767533898353577, + "learning_rate": 7.626634958382878e-06, + "loss": 0.4364, + "step": 1004 + }, + { + "epoch": 2.3877490336009517, + "grad_norm": 0.8398448824882507, + "learning_rate": 7.624256837098692e-06, + "loss": 0.481, + "step": 1005 + }, + { + "epoch": 2.3901278620279514, + "grad_norm": 0.7706701159477234, + "learning_rate": 7.621878715814507e-06, + "loss": 0.419, + "step": 1006 + }, + { + "epoch": 2.392506690454951, + "grad_norm": 0.6731982827186584, + "learning_rate": 7.619500594530321e-06, + "loss": 0.366, + "step": 1007 + }, + { + "epoch": 2.3948855188819507, + "grad_norm": 0.8492178916931152, + "learning_rate": 7.617122473246136e-06, + "loss": 0.3966, + "step": 1008 + }, + { + "epoch": 2.3972643473089503, + "grad_norm": 0.8632633090019226, + "learning_rate": 7.6147443519619505e-06, + "loss": 0.4395, + "step": 1009 + }, + { + "epoch": 2.39964317573595, + "grad_norm": 0.8129571080207825, + "learning_rate": 7.612366230677766e-06, + "loss": 0.4348, + "step": 1010 + }, + { + "epoch": 2.4020220041629496, + "grad_norm": 0.7998048067092896, + "learning_rate": 7.60998810939358e-06, + "loss": 0.4353, + "step": 1011 + }, + { + "epoch": 2.4044008325899493, + "grad_norm": 0.8238270878791809, + "learning_rate": 7.607609988109394e-06, + "loss": 0.4306, + "step": 1012 + }, + { + "epoch": 2.406779661016949, + "grad_norm": 0.8458043336868286, + "learning_rate": 7.605231866825209e-06, + "loss": 0.4118, + "step": 1013 + }, + { + "epoch": 2.409158489443949, + "grad_norm": 0.7849264740943909, + "learning_rate": 7.6028537455410236e-06, + "loss": 0.4128, + "step": 1014 + }, + { + "epoch": 2.4115373178709487, + "grad_norm": 0.8215279579162598, + "learning_rate": 7.600475624256838e-06, + "loss": 0.4356, + "step": 1015 + }, + { + "epoch": 2.4139161462979484, + "grad_norm": 0.9095781445503235, + "learning_rate": 7.598097502972652e-06, + "loss": 0.5023, + "step": 1016 + }, + { + "epoch": 2.416294974724948, + "grad_norm": 0.9642409682273865, + "learning_rate": 7.5957193816884665e-06, + "loss": 0.4223, + "step": 1017 + }, + { + "epoch": 2.4186738031519477, + "grad_norm": 0.9322527050971985, + "learning_rate": 7.593341260404281e-06, + "loss": 0.4262, + "step": 1018 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.808147132396698, + "learning_rate": 7.590963139120096e-06, + "loss": 0.4178, + "step": 1019 + }, + { + "epoch": 2.423431460005947, + "grad_norm": 0.6912996172904968, + "learning_rate": 7.5885850178359095e-06, + "loss": 0.3916, + "step": 1020 + }, + { + "epoch": 2.4258102884329467, + "grad_norm": 0.8006377816200256, + "learning_rate": 7.586206896551724e-06, + "loss": 0.3951, + "step": 1021 + }, + { + "epoch": 2.4281891168599463, + "grad_norm": 0.9005234837532043, + "learning_rate": 7.583828775267539e-06, + "loss": 0.4138, + "step": 1022 + }, + { + "epoch": 2.4305679452869464, + "grad_norm": 0.8165950179100037, + "learning_rate": 7.581450653983353e-06, + "loss": 0.3533, + "step": 1023 + }, + { + "epoch": 2.432946773713946, + "grad_norm": 0.7809553146362305, + "learning_rate": 7.579072532699169e-06, + "loss": 0.4572, + "step": 1024 + }, + { + "epoch": 2.4353256021409457, + "grad_norm": 0.8168447017669678, + "learning_rate": 7.5766944114149834e-06, + "loss": 0.3508, + "step": 1025 + }, + { + "epoch": 2.4377044305679454, + "grad_norm": 0.7898179292678833, + "learning_rate": 7.574316290130797e-06, + "loss": 0.4392, + "step": 1026 + }, + { + "epoch": 2.440083258994945, + "grad_norm": 0.8122592568397522, + "learning_rate": 7.571938168846612e-06, + "loss": 0.4618, + "step": 1027 + }, + { + "epoch": 2.4424620874219447, + "grad_norm": 1.0751276016235352, + "learning_rate": 7.569560047562426e-06, + "loss": 0.509, + "step": 1028 + }, + { + "epoch": 2.4448409158489444, + "grad_norm": 1.086591362953186, + "learning_rate": 7.567181926278241e-06, + "loss": 0.4968, + "step": 1029 + }, + { + "epoch": 2.447219744275944, + "grad_norm": 0.9699773192405701, + "learning_rate": 7.564803804994056e-06, + "loss": 0.4787, + "step": 1030 + }, + { + "epoch": 2.4495985727029437, + "grad_norm": 0.9711874723434448, + "learning_rate": 7.562425683709869e-06, + "loss": 0.4839, + "step": 1031 + }, + { + "epoch": 2.4519774011299433, + "grad_norm": 0.8436115384101868, + "learning_rate": 7.560047562425684e-06, + "loss": 0.4152, + "step": 1032 + }, + { + "epoch": 2.454356229556943, + "grad_norm": 0.8428522348403931, + "learning_rate": 7.557669441141499e-06, + "loss": 0.404, + "step": 1033 + }, + { + "epoch": 2.456735057983943, + "grad_norm": 0.7196497321128845, + "learning_rate": 7.555291319857313e-06, + "loss": 0.4125, + "step": 1034 + }, + { + "epoch": 2.4591138864109428, + "grad_norm": 0.8883410096168518, + "learning_rate": 7.552913198573128e-06, + "loss": 0.4078, + "step": 1035 + }, + { + "epoch": 2.4614927148379424, + "grad_norm": 0.8354674577713013, + "learning_rate": 7.550535077288942e-06, + "loss": 0.451, + "step": 1036 + }, + { + "epoch": 2.463871543264942, + "grad_norm": 0.9601000547409058, + "learning_rate": 7.548156956004757e-06, + "loss": 0.46, + "step": 1037 + }, + { + "epoch": 2.4662503716919417, + "grad_norm": 1.0465853214263916, + "learning_rate": 7.545778834720572e-06, + "loss": 0.4322, + "step": 1038 + }, + { + "epoch": 2.4686292001189414, + "grad_norm": 0.8815494179725647, + "learning_rate": 7.543400713436386e-06, + "loss": 0.4632, + "step": 1039 + }, + { + "epoch": 2.471008028545941, + "grad_norm": 0.9254335165023804, + "learning_rate": 7.541022592152201e-06, + "loss": 0.4438, + "step": 1040 + }, + { + "epoch": 2.4733868569729407, + "grad_norm": 0.784038782119751, + "learning_rate": 7.5386444708680155e-06, + "loss": 0.4168, + "step": 1041 + }, + { + "epoch": 2.4757656853999404, + "grad_norm": 0.8087180852890015, + "learning_rate": 7.536266349583829e-06, + "loss": 0.4297, + "step": 1042 + }, + { + "epoch": 2.4781445138269405, + "grad_norm": 0.8842628598213196, + "learning_rate": 7.533888228299644e-06, + "loss": 0.4424, + "step": 1043 + }, + { + "epoch": 2.48052334225394, + "grad_norm": 0.7972791194915771, + "learning_rate": 7.5315101070154585e-06, + "loss": 0.4086, + "step": 1044 + }, + { + "epoch": 2.4829021706809398, + "grad_norm": 0.8090541362762451, + "learning_rate": 7.529131985731273e-06, + "loss": 0.4553, + "step": 1045 + }, + { + "epoch": 2.4852809991079394, + "grad_norm": 1.0093398094177246, + "learning_rate": 7.526753864447087e-06, + "loss": 0.4309, + "step": 1046 + }, + { + "epoch": 2.487659827534939, + "grad_norm": 0.7368819117546082, + "learning_rate": 7.5243757431629015e-06, + "loss": 0.3282, + "step": 1047 + }, + { + "epoch": 2.4900386559619387, + "grad_norm": 0.8037929534912109, + "learning_rate": 7.521997621878716e-06, + "loss": 0.4103, + "step": 1048 + }, + { + "epoch": 2.4924174843889384, + "grad_norm": 0.8851643800735474, + "learning_rate": 7.519619500594531e-06, + "loss": 0.4507, + "step": 1049 + }, + { + "epoch": 2.494796312815938, + "grad_norm": 0.8400409817695618, + "learning_rate": 7.517241379310345e-06, + "loss": 0.5119, + "step": 1050 + }, + { + "epoch": 2.494796312815938, + "eval_loss": 0.45415621995925903, + "eval_runtime": 23.5726, + "eval_samples_per_second": 31.732, + "eval_steps_per_second": 15.866, + "step": 1050 + }, + { + "epoch": 2.4971751412429377, + "grad_norm": 0.8481485247612, + "learning_rate": 7.514863258026161e-06, + "loss": 0.4947, + "step": 1051 + }, + { + "epoch": 2.4995539696699374, + "grad_norm": 0.8240973353385925, + "learning_rate": 7.5124851367419745e-06, + "loss": 0.4718, + "step": 1052 + }, + { + "epoch": 2.501932798096937, + "grad_norm": 0.9148377776145935, + "learning_rate": 7.510107015457789e-06, + "loss": 0.433, + "step": 1053 + }, + { + "epoch": 2.504311626523937, + "grad_norm": 0.7170572876930237, + "learning_rate": 7.507728894173604e-06, + "loss": 0.4196, + "step": 1054 + }, + { + "epoch": 2.506690454950937, + "grad_norm": 0.9177691340446472, + "learning_rate": 7.505350772889418e-06, + "loss": 0.4204, + "step": 1055 + }, + { + "epoch": 2.5090692833779364, + "grad_norm": 0.8131303787231445, + "learning_rate": 7.502972651605233e-06, + "loss": 0.3749, + "step": 1056 + }, + { + "epoch": 2.511448111804936, + "grad_norm": 0.9637304544448853, + "learning_rate": 7.500594530321047e-06, + "loss": 0.511, + "step": 1057 + }, + { + "epoch": 2.5138269402319358, + "grad_norm": 0.8829891681671143, + "learning_rate": 7.498216409036861e-06, + "loss": 0.3903, + "step": 1058 + }, + { + "epoch": 2.5162057686589354, + "grad_norm": 0.7840805649757385, + "learning_rate": 7.495838287752676e-06, + "loss": 0.344, + "step": 1059 + }, + { + "epoch": 2.518584597085935, + "grad_norm": 0.9543834924697876, + "learning_rate": 7.493460166468491e-06, + "loss": 0.4329, + "step": 1060 + }, + { + "epoch": 2.5209634255129347, + "grad_norm": 0.8606633543968201, + "learning_rate": 7.491082045184304e-06, + "loss": 0.3607, + "step": 1061 + }, + { + "epoch": 2.5233422539399344, + "grad_norm": 0.8122538924217224, + "learning_rate": 7.488703923900119e-06, + "loss": 0.4276, + "step": 1062 + }, + { + "epoch": 2.5257210823669345, + "grad_norm": 0.8803274631500244, + "learning_rate": 7.4863258026159336e-06, + "loss": 0.5132, + "step": 1063 + }, + { + "epoch": 2.528099910793934, + "grad_norm": 0.9252219200134277, + "learning_rate": 7.483947681331748e-06, + "loss": 0.4139, + "step": 1064 + }, + { + "epoch": 2.530478739220934, + "grad_norm": 0.7352156639099121, + "learning_rate": 7.481569560047564e-06, + "loss": 0.4336, + "step": 1065 + }, + { + "epoch": 2.5328575676479335, + "grad_norm": 0.7874597311019897, + "learning_rate": 7.479191438763378e-06, + "loss": 0.4129, + "step": 1066 + }, + { + "epoch": 2.535236396074933, + "grad_norm": 0.9504901766777039, + "learning_rate": 7.476813317479192e-06, + "loss": 0.463, + "step": 1067 + }, + { + "epoch": 2.5376152245019328, + "grad_norm": 0.8163108229637146, + "learning_rate": 7.474435196195007e-06, + "loss": 0.3959, + "step": 1068 + }, + { + "epoch": 2.5399940529289324, + "grad_norm": 0.8390555381774902, + "learning_rate": 7.472057074910821e-06, + "loss": 0.3948, + "step": 1069 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 0.8205837607383728, + "learning_rate": 7.469678953626636e-06, + "loss": 0.4396, + "step": 1070 + }, + { + "epoch": 2.5447517097829317, + "grad_norm": 0.9405626058578491, + "learning_rate": 7.4673008323424505e-06, + "loss": 0.4914, + "step": 1071 + }, + { + "epoch": 2.547130538209932, + "grad_norm": 0.8709831237792969, + "learning_rate": 7.464922711058264e-06, + "loss": 0.4718, + "step": 1072 + }, + { + "epoch": 2.549509366636931, + "grad_norm": 0.8799251317977905, + "learning_rate": 7.462544589774079e-06, + "loss": 0.3849, + "step": 1073 + }, + { + "epoch": 2.551888195063931, + "grad_norm": 0.9503681659698486, + "learning_rate": 7.4601664684898934e-06, + "loss": 0.481, + "step": 1074 + }, + { + "epoch": 2.554267023490931, + "grad_norm": 0.8706904053688049, + "learning_rate": 7.457788347205708e-06, + "loss": 0.4338, + "step": 1075 + }, + { + "epoch": 2.5566458519179305, + "grad_norm": 0.8905729651451111, + "learning_rate": 7.455410225921522e-06, + "loss": 0.4613, + "step": 1076 + }, + { + "epoch": 2.55902468034493, + "grad_norm": 0.8659430146217346, + "learning_rate": 7.4530321046373364e-06, + "loss": 0.3905, + "step": 1077 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.8130545020103455, + "learning_rate": 7.450653983353152e-06, + "loss": 0.3887, + "step": 1078 + }, + { + "epoch": 2.5637823371989295, + "grad_norm": 0.7481282353401184, + "learning_rate": 7.4482758620689665e-06, + "loss": 0.4762, + "step": 1079 + }, + { + "epoch": 2.566161165625929, + "grad_norm": 0.9743167757987976, + "learning_rate": 7.445897740784781e-06, + "loss": 0.4383, + "step": 1080 + }, + { + "epoch": 2.568539994052929, + "grad_norm": 0.915328323841095, + "learning_rate": 7.443519619500596e-06, + "loss": 0.4543, + "step": 1081 + }, + { + "epoch": 2.5709188224799284, + "grad_norm": 0.7975335717201233, + "learning_rate": 7.4411414982164095e-06, + "loss": 0.4067, + "step": 1082 + }, + { + "epoch": 2.5732976509069285, + "grad_norm": 0.725409209728241, + "learning_rate": 7.438763376932224e-06, + "loss": 0.3797, + "step": 1083 + }, + { + "epoch": 2.575676479333928, + "grad_norm": 0.9249476194381714, + "learning_rate": 7.436385255648039e-06, + "loss": 0.381, + "step": 1084 + }, + { + "epoch": 2.578055307760928, + "grad_norm": 0.9382577538490295, + "learning_rate": 7.434007134363853e-06, + "loss": 0.5545, + "step": 1085 + }, + { + "epoch": 2.5804341361879275, + "grad_norm": 0.7554445266723633, + "learning_rate": 7.431629013079668e-06, + "loss": 0.3713, + "step": 1086 + }, + { + "epoch": 2.582812964614927, + "grad_norm": 0.9321329593658447, + "learning_rate": 7.429250891795482e-06, + "loss": 0.4606, + "step": 1087 + }, + { + "epoch": 2.585191793041927, + "grad_norm": 0.9284448027610779, + "learning_rate": 7.426872770511296e-06, + "loss": 0.4157, + "step": 1088 + }, + { + "epoch": 2.5875706214689265, + "grad_norm": 0.8226819038391113, + "learning_rate": 7.424494649227111e-06, + "loss": 0.4222, + "step": 1089 + }, + { + "epoch": 2.589949449895926, + "grad_norm": 0.8636748790740967, + "learning_rate": 7.4221165279429255e-06, + "loss": 0.4549, + "step": 1090 + }, + { + "epoch": 2.592328278322926, + "grad_norm": 0.8430980443954468, + "learning_rate": 7.419738406658739e-06, + "loss": 0.3462, + "step": 1091 + }, + { + "epoch": 2.594707106749926, + "grad_norm": 0.7622218728065491, + "learning_rate": 7.417360285374556e-06, + "loss": 0.3794, + "step": 1092 + }, + { + "epoch": 2.597085935176925, + "grad_norm": 0.7489076256752014, + "learning_rate": 7.414982164090369e-06, + "loss": 0.3393, + "step": 1093 + }, + { + "epoch": 2.599464763603925, + "grad_norm": 0.9032859206199646, + "learning_rate": 7.412604042806184e-06, + "loss": 0.4499, + "step": 1094 + }, + { + "epoch": 2.601843592030925, + "grad_norm": 0.9318482875823975, + "learning_rate": 7.4102259215219986e-06, + "loss": 0.4122, + "step": 1095 + }, + { + "epoch": 2.6042224204579245, + "grad_norm": 0.9172553420066833, + "learning_rate": 7.407847800237813e-06, + "loss": 0.3807, + "step": 1096 + }, + { + "epoch": 2.606601248884924, + "grad_norm": 0.8228003978729248, + "learning_rate": 7.405469678953627e-06, + "loss": 0.416, + "step": 1097 + }, + { + "epoch": 2.608980077311924, + "grad_norm": 0.7377281188964844, + "learning_rate": 7.4030915576694416e-06, + "loss": 0.3626, + "step": 1098 + }, + { + "epoch": 2.6113589057389235, + "grad_norm": 0.8653618693351746, + "learning_rate": 7.400713436385256e-06, + "loss": 0.4642, + "step": 1099 + }, + { + "epoch": 2.613737734165923, + "grad_norm": 0.8590367436408997, + "learning_rate": 7.398335315101071e-06, + "loss": 0.4221, + "step": 1100 + }, + { + "epoch": 2.613737734165923, + "eval_loss": 0.4517585337162018, + "eval_runtime": 24.2062, + "eval_samples_per_second": 30.901, + "eval_steps_per_second": 15.451, + "step": 1100 + }, + { + "epoch": 2.6161165625929232, + "grad_norm": 0.9353041648864746, + "learning_rate": 7.395957193816885e-06, + "loss": 0.4744, + "step": 1101 + }, + { + "epoch": 2.6184953910199225, + "grad_norm": 0.8280140161514282, + "learning_rate": 7.393579072532699e-06, + "loss": 0.3974, + "step": 1102 + }, + { + "epoch": 2.6208742194469226, + "grad_norm": 0.933983325958252, + "learning_rate": 7.391200951248514e-06, + "loss": 0.4807, + "step": 1103 + }, + { + "epoch": 2.623253047873922, + "grad_norm": 1.1500540971755981, + "learning_rate": 7.388822829964328e-06, + "loss": 0.4747, + "step": 1104 + }, + { + "epoch": 2.625631876300922, + "grad_norm": 0.9118829369544983, + "learning_rate": 7.386444708680143e-06, + "loss": 0.4426, + "step": 1105 + }, + { + "epoch": 2.6280107047279215, + "grad_norm": 0.7580151557922363, + "learning_rate": 7.3840665873959584e-06, + "loss": 0.3316, + "step": 1106 + }, + { + "epoch": 2.630389533154921, + "grad_norm": 0.9060569405555725, + "learning_rate": 7.381688466111773e-06, + "loss": 0.4293, + "step": 1107 + }, + { + "epoch": 2.632768361581921, + "grad_norm": 0.885898232460022, + "learning_rate": 7.379310344827587e-06, + "loss": 0.4066, + "step": 1108 + }, + { + "epoch": 2.6351471900089205, + "grad_norm": 0.9551964998245239, + "learning_rate": 7.3769322235434014e-06, + "loss": 0.3974, + "step": 1109 + }, + { + "epoch": 2.63752601843592, + "grad_norm": 0.861760675907135, + "learning_rate": 7.374554102259216e-06, + "loss": 0.4374, + "step": 1110 + }, + { + "epoch": 2.63990484686292, + "grad_norm": 0.8561997413635254, + "learning_rate": 7.372175980975031e-06, + "loss": 0.4391, + "step": 1111 + }, + { + "epoch": 2.64228367528992, + "grad_norm": 0.8459076285362244, + "learning_rate": 7.369797859690844e-06, + "loss": 0.4269, + "step": 1112 + }, + { + "epoch": 2.6446625037169196, + "grad_norm": 0.9263935685157776, + "learning_rate": 7.367419738406659e-06, + "loss": 0.4336, + "step": 1113 + }, + { + "epoch": 2.6470413321439192, + "grad_norm": 1.0965032577514648, + "learning_rate": 7.365041617122474e-06, + "loss": 0.4633, + "step": 1114 + }, + { + "epoch": 2.649420160570919, + "grad_norm": 0.9005448818206787, + "learning_rate": 7.362663495838288e-06, + "loss": 0.4089, + "step": 1115 + }, + { + "epoch": 2.6517989889979185, + "grad_norm": 0.8688952922821045, + "learning_rate": 7.360285374554103e-06, + "loss": 0.3861, + "step": 1116 + }, + { + "epoch": 2.654177817424918, + "grad_norm": 0.8692402243614197, + "learning_rate": 7.357907253269917e-06, + "loss": 0.4643, + "step": 1117 + }, + { + "epoch": 2.656556645851918, + "grad_norm": 1.012959361076355, + "learning_rate": 7.355529131985731e-06, + "loss": 0.4249, + "step": 1118 + }, + { + "epoch": 2.6589354742789175, + "grad_norm": 0.9100857973098755, + "learning_rate": 7.353151010701547e-06, + "loss": 0.4108, + "step": 1119 + }, + { + "epoch": 2.661314302705917, + "grad_norm": 1.042836308479309, + "learning_rate": 7.350772889417361e-06, + "loss": 0.5077, + "step": 1120 + }, + { + "epoch": 2.6636931311329173, + "grad_norm": 0.9135213494300842, + "learning_rate": 7.348394768133176e-06, + "loss": 0.4707, + "step": 1121 + }, + { + "epoch": 2.6660719595599165, + "grad_norm": 1.1313445568084717, + "learning_rate": 7.3460166468489905e-06, + "loss": 0.5278, + "step": 1122 + }, + { + "epoch": 2.6684507879869166, + "grad_norm": 1.0106595754623413, + "learning_rate": 7.343638525564804e-06, + "loss": 0.4524, + "step": 1123 + }, + { + "epoch": 2.6708296164139163, + "grad_norm": 0.9401953816413879, + "learning_rate": 7.341260404280619e-06, + "loss": 0.4457, + "step": 1124 + }, + { + "epoch": 2.673208444840916, + "grad_norm": 0.8026012778282166, + "learning_rate": 7.3388822829964335e-06, + "loss": 0.4097, + "step": 1125 + }, + { + "epoch": 2.6755872732679156, + "grad_norm": 0.8635648488998413, + "learning_rate": 7.336504161712248e-06, + "loss": 0.4409, + "step": 1126 + }, + { + "epoch": 2.6779661016949152, + "grad_norm": 0.8960875868797302, + "learning_rate": 7.334126040428062e-06, + "loss": 0.4104, + "step": 1127 + }, + { + "epoch": 2.680344930121915, + "grad_norm": 0.7388194799423218, + "learning_rate": 7.3317479191438765e-06, + "loss": 0.2977, + "step": 1128 + }, + { + "epoch": 2.6827237585489145, + "grad_norm": 0.8879340291023254, + "learning_rate": 7.329369797859691e-06, + "loss": 0.37, + "step": 1129 + }, + { + "epoch": 2.685102586975914, + "grad_norm": 0.8800488710403442, + "learning_rate": 7.326991676575506e-06, + "loss": 0.4243, + "step": 1130 + }, + { + "epoch": 2.687481415402914, + "grad_norm": 0.8400546312332153, + "learning_rate": 7.32461355529132e-06, + "loss": 0.4247, + "step": 1131 + }, + { + "epoch": 2.689860243829914, + "grad_norm": 0.7608059048652649, + "learning_rate": 7.322235434007134e-06, + "loss": 0.3839, + "step": 1132 + }, + { + "epoch": 2.6922390722569136, + "grad_norm": 0.8763844966888428, + "learning_rate": 7.3198573127229496e-06, + "loss": 0.4345, + "step": 1133 + }, + { + "epoch": 2.6946179006839133, + "grad_norm": 0.9620002508163452, + "learning_rate": 7.317479191438764e-06, + "loss": 0.4047, + "step": 1134 + }, + { + "epoch": 2.696996729110913, + "grad_norm": 0.9018396735191345, + "learning_rate": 7.315101070154579e-06, + "loss": 0.4458, + "step": 1135 + }, + { + "epoch": 2.6993755575379126, + "grad_norm": 0.8799375891685486, + "learning_rate": 7.312722948870393e-06, + "loss": 0.391, + "step": 1136 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 1.1306402683258057, + "learning_rate": 7.310344827586208e-06, + "loss": 0.4664, + "step": 1137 + }, + { + "epoch": 2.704133214391912, + "grad_norm": 0.9824092984199524, + "learning_rate": 7.307966706302022e-06, + "loss": 0.4377, + "step": 1138 + }, + { + "epoch": 2.7065120428189116, + "grad_norm": 0.8591241240501404, + "learning_rate": 7.305588585017836e-06, + "loss": 0.382, + "step": 1139 + }, + { + "epoch": 2.708890871245911, + "grad_norm": 0.8072546720504761, + "learning_rate": 7.303210463733651e-06, + "loss": 0.3872, + "step": 1140 + }, + { + "epoch": 2.7112696996729113, + "grad_norm": 0.9364895820617676, + "learning_rate": 7.300832342449466e-06, + "loss": 0.4346, + "step": 1141 + }, + { + "epoch": 2.7136485280999105, + "grad_norm": 1.0937573909759521, + "learning_rate": 7.29845422116528e-06, + "loss": 0.44, + "step": 1142 + }, + { + "epoch": 2.7160273565269106, + "grad_norm": 0.9001314043998718, + "learning_rate": 7.296076099881094e-06, + "loss": 0.4214, + "step": 1143 + }, + { + "epoch": 2.7184061849539103, + "grad_norm": 0.8821179866790771, + "learning_rate": 7.293697978596909e-06, + "loss": 0.4664, + "step": 1144 + }, + { + "epoch": 2.72078501338091, + "grad_norm": 0.7611337900161743, + "learning_rate": 7.291319857312723e-06, + "loss": 0.398, + "step": 1145 + }, + { + "epoch": 2.7231638418079096, + "grad_norm": 0.9181427955627441, + "learning_rate": 7.288941736028539e-06, + "loss": 0.4196, + "step": 1146 + }, + { + "epoch": 2.7255426702349093, + "grad_norm": 0.8340358138084412, + "learning_rate": 7.286563614744353e-06, + "loss": 0.4411, + "step": 1147 + }, + { + "epoch": 2.727921498661909, + "grad_norm": 0.909551739692688, + "learning_rate": 7.284185493460168e-06, + "loss": 0.4447, + "step": 1148 + }, + { + "epoch": 2.7303003270889086, + "grad_norm": 0.8723570704460144, + "learning_rate": 7.281807372175982e-06, + "loss": 0.4906, + "step": 1149 + }, + { + "epoch": 2.7326791555159087, + "grad_norm": 0.9500892162322998, + "learning_rate": 7.279429250891796e-06, + "loss": 0.4334, + "step": 1150 + }, + { + "epoch": 2.7326791555159087, + "eval_loss": 0.4486580193042755, + "eval_runtime": 23.7803, + "eval_samples_per_second": 31.455, + "eval_steps_per_second": 15.727, + "step": 1150 + }, + { + "epoch": 2.735057983942908, + "grad_norm": 0.9011569023132324, + "learning_rate": 7.277051129607611e-06, + "loss": 0.3926, + "step": 1151 + }, + { + "epoch": 2.737436812369908, + "grad_norm": 0.9085838794708252, + "learning_rate": 7.2746730083234255e-06, + "loss": 0.4117, + "step": 1152 + }, + { + "epoch": 2.7398156407969076, + "grad_norm": 0.8706454634666443, + "learning_rate": 7.272294887039239e-06, + "loss": 0.4143, + "step": 1153 + }, + { + "epoch": 2.7421944692239073, + "grad_norm": 0.895187258720398, + "learning_rate": 7.269916765755054e-06, + "loss": 0.3843, + "step": 1154 + }, + { + "epoch": 2.744573297650907, + "grad_norm": 0.8368811011314392, + "learning_rate": 7.2675386444708685e-06, + "loss": 0.406, + "step": 1155 + }, + { + "epoch": 2.7469521260779066, + "grad_norm": 0.9719745516777039, + "learning_rate": 7.265160523186683e-06, + "loss": 0.4458, + "step": 1156 + }, + { + "epoch": 2.7493309545049063, + "grad_norm": 0.9641316533088684, + "learning_rate": 7.262782401902498e-06, + "loss": 0.5042, + "step": 1157 + }, + { + "epoch": 2.751709782931906, + "grad_norm": 0.8922989368438721, + "learning_rate": 7.2604042806183114e-06, + "loss": 0.3726, + "step": 1158 + }, + { + "epoch": 2.7540886113589056, + "grad_norm": 0.9970529675483704, + "learning_rate": 7.258026159334126e-06, + "loss": 0.4381, + "step": 1159 + }, + { + "epoch": 2.7564674397859052, + "grad_norm": 0.8907633423805237, + "learning_rate": 7.2556480380499415e-06, + "loss": 0.4332, + "step": 1160 + }, + { + "epoch": 2.7588462682129054, + "grad_norm": 0.9775222539901733, + "learning_rate": 7.253269916765756e-06, + "loss": 0.4176, + "step": 1161 + }, + { + "epoch": 2.761225096639905, + "grad_norm": 0.8592268824577332, + "learning_rate": 7.250891795481571e-06, + "loss": 0.4063, + "step": 1162 + }, + { + "epoch": 2.7636039250669047, + "grad_norm": 0.9334284663200378, + "learning_rate": 7.248513674197385e-06, + "loss": 0.3658, + "step": 1163 + }, + { + "epoch": 2.7659827534939043, + "grad_norm": 0.8372347950935364, + "learning_rate": 7.246135552913199e-06, + "loss": 0.4253, + "step": 1164 + }, + { + "epoch": 2.768361581920904, + "grad_norm": 0.9851978421211243, + "learning_rate": 7.243757431629014e-06, + "loss": 0.4727, + "step": 1165 + }, + { + "epoch": 2.7707404103479036, + "grad_norm": 0.8779081702232361, + "learning_rate": 7.241379310344828e-06, + "loss": 0.4486, + "step": 1166 + }, + { + "epoch": 2.7731192387749033, + "grad_norm": 1.0158051252365112, + "learning_rate": 7.239001189060643e-06, + "loss": 0.4665, + "step": 1167 + }, + { + "epoch": 2.775498067201903, + "grad_norm": 0.8791491389274597, + "learning_rate": 7.236623067776457e-06, + "loss": 0.413, + "step": 1168 + }, + { + "epoch": 2.7778768956289026, + "grad_norm": 0.8441289067268372, + "learning_rate": 7.234244946492271e-06, + "loss": 0.3928, + "step": 1169 + }, + { + "epoch": 2.7802557240559027, + "grad_norm": 0.9779683351516724, + "learning_rate": 7.231866825208086e-06, + "loss": 0.4603, + "step": 1170 + }, + { + "epoch": 2.782634552482902, + "grad_norm": 0.9129390716552734, + "learning_rate": 7.2294887039239005e-06, + "loss": 0.3737, + "step": 1171 + }, + { + "epoch": 2.785013380909902, + "grad_norm": 0.9201071262359619, + "learning_rate": 7.227110582639715e-06, + "loss": 0.4837, + "step": 1172 + }, + { + "epoch": 2.7873922093369017, + "grad_norm": 0.9442705512046814, + "learning_rate": 7.224732461355529e-06, + "loss": 0.4923, + "step": 1173 + }, + { + "epoch": 2.7897710377639013, + "grad_norm": 0.8995589017868042, + "learning_rate": 7.222354340071344e-06, + "loss": 0.3988, + "step": 1174 + }, + { + "epoch": 2.792149866190901, + "grad_norm": 0.9019346833229065, + "learning_rate": 7.219976218787159e-06, + "loss": 0.4066, + "step": 1175 + }, + { + "epoch": 2.7945286946179007, + "grad_norm": 0.9545390605926514, + "learning_rate": 7.217598097502974e-06, + "loss": 0.4441, + "step": 1176 + }, + { + "epoch": 2.7969075230449003, + "grad_norm": 0.9144545197486877, + "learning_rate": 7.215219976218788e-06, + "loss": 0.4616, + "step": 1177 + }, + { + "epoch": 2.7992863514719, + "grad_norm": 0.7605383992195129, + "learning_rate": 7.212841854934603e-06, + "loss": 0.3796, + "step": 1178 + }, + { + "epoch": 2.8016651798988996, + "grad_norm": 0.9481088519096375, + "learning_rate": 7.2104637336504166e-06, + "loss": 0.3806, + "step": 1179 + }, + { + "epoch": 2.8040440083258993, + "grad_norm": 0.8580485582351685, + "learning_rate": 7.208085612366231e-06, + "loss": 0.3733, + "step": 1180 + }, + { + "epoch": 2.8064228367528994, + "grad_norm": 0.8891831636428833, + "learning_rate": 7.205707491082046e-06, + "loss": 0.4286, + "step": 1181 + }, + { + "epoch": 2.808801665179899, + "grad_norm": 0.8241470456123352, + "learning_rate": 7.20332936979786e-06, + "loss": 0.385, + "step": 1182 + }, + { + "epoch": 2.8111804936068987, + "grad_norm": 0.9038121700286865, + "learning_rate": 7.200951248513674e-06, + "loss": 0.4072, + "step": 1183 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 0.8310014605522156, + "learning_rate": 7.198573127229489e-06, + "loss": 0.3995, + "step": 1184 + }, + { + "epoch": 2.815938150460898, + "grad_norm": 0.8004698157310486, + "learning_rate": 7.196195005945303e-06, + "loss": 0.3813, + "step": 1185 + }, + { + "epoch": 2.8183169788878977, + "grad_norm": 0.8052743077278137, + "learning_rate": 7.193816884661118e-06, + "loss": 0.424, + "step": 1186 + }, + { + "epoch": 2.8206958073148973, + "grad_norm": 0.901412308216095, + "learning_rate": 7.1914387633769335e-06, + "loss": 0.3483, + "step": 1187 + }, + { + "epoch": 2.823074635741897, + "grad_norm": 1.0076175928115845, + "learning_rate": 7.189060642092748e-06, + "loss": 0.3676, + "step": 1188 + }, + { + "epoch": 2.8254534641688966, + "grad_norm": 1.0177217721939087, + "learning_rate": 7.186682520808562e-06, + "loss": 0.5058, + "step": 1189 + }, + { + "epoch": 2.8278322925958967, + "grad_norm": 0.9994361400604248, + "learning_rate": 7.1843043995243764e-06, + "loss": 0.3845, + "step": 1190 + }, + { + "epoch": 2.830211121022896, + "grad_norm": 0.8604933619499207, + "learning_rate": 7.181926278240191e-06, + "loss": 0.3835, + "step": 1191 + }, + { + "epoch": 2.832589949449896, + "grad_norm": 0.9893335103988647, + "learning_rate": 7.179548156956006e-06, + "loss": 0.4962, + "step": 1192 + }, + { + "epoch": 2.8349687778768957, + "grad_norm": 0.8989253640174866, + "learning_rate": 7.17717003567182e-06, + "loss": 0.3471, + "step": 1193 + }, + { + "epoch": 2.8373476063038954, + "grad_norm": 0.9898166060447693, + "learning_rate": 7.174791914387634e-06, + "loss": 0.4688, + "step": 1194 + }, + { + "epoch": 2.839726434730895, + "grad_norm": 1.0009859800338745, + "learning_rate": 7.172413793103449e-06, + "loss": 0.4376, + "step": 1195 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.9105443358421326, + "learning_rate": 7.170035671819263e-06, + "loss": 0.4168, + "step": 1196 + }, + { + "epoch": 2.8444840915848943, + "grad_norm": 0.8202179074287415, + "learning_rate": 7.167657550535078e-06, + "loss": 0.3875, + "step": 1197 + }, + { + "epoch": 2.846862920011894, + "grad_norm": 0.9063593745231628, + "learning_rate": 7.165279429250892e-06, + "loss": 0.3298, + "step": 1198 + }, + { + "epoch": 2.849241748438894, + "grad_norm": 0.8110963106155396, + "learning_rate": 7.162901307966706e-06, + "loss": 0.3565, + "step": 1199 + }, + { + "epoch": 2.8516205768658933, + "grad_norm": 0.8527550101280212, + "learning_rate": 7.160523186682521e-06, + "loss": 0.4105, + "step": 1200 + }, + { + "epoch": 2.8516205768658933, + "eval_loss": 0.4469766914844513, + "eval_runtime": 23.7519, + "eval_samples_per_second": 31.492, + "eval_steps_per_second": 15.746, + "step": 1200 + }, + { + "epoch": 2.8539994052928934, + "grad_norm": 1.0001276731491089, + "learning_rate": 7.158145065398336e-06, + "loss": 0.4267, + "step": 1201 + }, + { + "epoch": 2.856378233719893, + "grad_norm": 0.9700866341590881, + "learning_rate": 7.155766944114151e-06, + "loss": 0.4324, + "step": 1202 + }, + { + "epoch": 2.8587570621468927, + "grad_norm": 0.9702510237693787, + "learning_rate": 7.1533888228299655e-06, + "loss": 0.5001, + "step": 1203 + }, + { + "epoch": 2.8611358905738924, + "grad_norm": 0.8229884505271912, + "learning_rate": 7.151010701545779e-06, + "loss": 0.3884, + "step": 1204 + }, + { + "epoch": 2.863514719000892, + "grad_norm": 1.019861102104187, + "learning_rate": 7.148632580261594e-06, + "loss": 0.4643, + "step": 1205 + }, + { + "epoch": 2.8658935474278917, + "grad_norm": 0.9890809059143066, + "learning_rate": 7.1462544589774085e-06, + "loss": 0.4116, + "step": 1206 + }, + { + "epoch": 2.8682723758548914, + "grad_norm": 0.9212976098060608, + "learning_rate": 7.143876337693223e-06, + "loss": 0.5006, + "step": 1207 + }, + { + "epoch": 2.870651204281891, + "grad_norm": 0.7737031579017639, + "learning_rate": 7.141498216409038e-06, + "loss": 0.371, + "step": 1208 + }, + { + "epoch": 2.8730300327088907, + "grad_norm": 0.9083512425422668, + "learning_rate": 7.1391200951248515e-06, + "loss": 0.463, + "step": 1209 + }, + { + "epoch": 2.875408861135891, + "grad_norm": 1.010092854499817, + "learning_rate": 7.136741973840666e-06, + "loss": 0.4459, + "step": 1210 + }, + { + "epoch": 2.8777876895628904, + "grad_norm": 0.9902401566505432, + "learning_rate": 7.134363852556481e-06, + "loss": 0.3858, + "step": 1211 + }, + { + "epoch": 2.88016651798989, + "grad_norm": 0.9283949732780457, + "learning_rate": 7.131985731272295e-06, + "loss": 0.4559, + "step": 1212 + }, + { + "epoch": 2.8825453464168898, + "grad_norm": 0.851934552192688, + "learning_rate": 7.129607609988109e-06, + "loss": 0.3378, + "step": 1213 + }, + { + "epoch": 2.8849241748438894, + "grad_norm": 1.0234075784683228, + "learning_rate": 7.127229488703924e-06, + "loss": 0.4705, + "step": 1214 + }, + { + "epoch": 2.887303003270889, + "grad_norm": 0.8380138278007507, + "learning_rate": 7.124851367419739e-06, + "loss": 0.4178, + "step": 1215 + }, + { + "epoch": 2.8896818316978887, + "grad_norm": 0.917855978012085, + "learning_rate": 7.122473246135554e-06, + "loss": 0.4106, + "step": 1216 + }, + { + "epoch": 2.8920606601248884, + "grad_norm": 0.8371177315711975, + "learning_rate": 7.120095124851368e-06, + "loss": 0.4447, + "step": 1217 + }, + { + "epoch": 2.894439488551888, + "grad_norm": 0.861172080039978, + "learning_rate": 7.117717003567183e-06, + "loss": 0.4043, + "step": 1218 + }, + { + "epoch": 2.896818316978888, + "grad_norm": 0.9070718884468079, + "learning_rate": 7.115338882282997e-06, + "loss": 0.3961, + "step": 1219 + }, + { + "epoch": 2.8991971454058874, + "grad_norm": 0.8216188549995422, + "learning_rate": 7.112960760998811e-06, + "loss": 0.4169, + "step": 1220 + }, + { + "epoch": 2.9015759738328875, + "grad_norm": 0.8873976469039917, + "learning_rate": 7.110582639714626e-06, + "loss": 0.4755, + "step": 1221 + }, + { + "epoch": 2.903954802259887, + "grad_norm": 0.995711624622345, + "learning_rate": 7.108204518430441e-06, + "loss": 0.4695, + "step": 1222 + }, + { + "epoch": 2.9063336306868868, + "grad_norm": 1.0826010704040527, + "learning_rate": 7.105826397146255e-06, + "loss": 0.4348, + "step": 1223 + }, + { + "epoch": 2.9087124591138864, + "grad_norm": 1.059232234954834, + "learning_rate": 7.103448275862069e-06, + "loss": 0.5508, + "step": 1224 + }, + { + "epoch": 2.911091287540886, + "grad_norm": 0.7578072547912598, + "learning_rate": 7.101070154577884e-06, + "loss": 0.3448, + "step": 1225 + }, + { + "epoch": 2.9134701159678857, + "grad_norm": 0.8965063691139221, + "learning_rate": 7.098692033293698e-06, + "loss": 0.4102, + "step": 1226 + }, + { + "epoch": 2.9158489443948854, + "grad_norm": 0.9182284474372864, + "learning_rate": 7.096313912009513e-06, + "loss": 0.4211, + "step": 1227 + }, + { + "epoch": 2.918227772821885, + "grad_norm": 0.8178484439849854, + "learning_rate": 7.093935790725328e-06, + "loss": 0.3629, + "step": 1228 + }, + { + "epoch": 2.9206066012488847, + "grad_norm": 0.9502084255218506, + "learning_rate": 7.091557669441143e-06, + "loss": 0.4333, + "step": 1229 + }, + { + "epoch": 2.922985429675885, + "grad_norm": 0.922738790512085, + "learning_rate": 7.089179548156957e-06, + "loss": 0.3856, + "step": 1230 + }, + { + "epoch": 2.9253642581028845, + "grad_norm": 0.9054938554763794, + "learning_rate": 7.086801426872771e-06, + "loss": 0.4004, + "step": 1231 + }, + { + "epoch": 2.927743086529884, + "grad_norm": 0.9002532958984375, + "learning_rate": 7.084423305588586e-06, + "loss": 0.4149, + "step": 1232 + }, + { + "epoch": 2.930121914956884, + "grad_norm": 0.8404497504234314, + "learning_rate": 7.0820451843044005e-06, + "loss": 0.4861, + "step": 1233 + }, + { + "epoch": 2.9325007433838834, + "grad_norm": 0.8800899982452393, + "learning_rate": 7.079667063020215e-06, + "loss": 0.4384, + "step": 1234 + }, + { + "epoch": 2.934879571810883, + "grad_norm": 0.9527946710586548, + "learning_rate": 7.077288941736029e-06, + "loss": 0.3444, + "step": 1235 + }, + { + "epoch": 2.9372584002378828, + "grad_norm": 0.8262097835540771, + "learning_rate": 7.0749108204518435e-06, + "loss": 0.3664, + "step": 1236 + }, + { + "epoch": 2.9396372286648824, + "grad_norm": 0.8551408052444458, + "learning_rate": 7.072532699167658e-06, + "loss": 0.3411, + "step": 1237 + }, + { + "epoch": 2.942016057091882, + "grad_norm": 1.015271544456482, + "learning_rate": 7.070154577883473e-06, + "loss": 0.4808, + "step": 1238 + }, + { + "epoch": 2.944394885518882, + "grad_norm": 0.9621483087539673, + "learning_rate": 7.0677764565992864e-06, + "loss": 0.4407, + "step": 1239 + }, + { + "epoch": 2.9467737139458814, + "grad_norm": 1.0789779424667358, + "learning_rate": 7.065398335315101e-06, + "loss": 0.4207, + "step": 1240 + }, + { + "epoch": 2.9491525423728815, + "grad_norm": 0.8905445337295532, + "learning_rate": 7.063020214030916e-06, + "loss": 0.4231, + "step": 1241 + }, + { + "epoch": 2.951531370799881, + "grad_norm": 0.9205809235572815, + "learning_rate": 7.060642092746731e-06, + "loss": 0.4205, + "step": 1242 + }, + { + "epoch": 2.953910199226881, + "grad_norm": 0.9186838269233704, + "learning_rate": 7.058263971462546e-06, + "loss": 0.4004, + "step": 1243 + }, + { + "epoch": 2.9562890276538805, + "grad_norm": 0.9650325775146484, + "learning_rate": 7.05588585017836e-06, + "loss": 0.4285, + "step": 1244 + }, + { + "epoch": 2.95866785608088, + "grad_norm": 0.9170815348625183, + "learning_rate": 7.053507728894174e-06, + "loss": 0.4534, + "step": 1245 + }, + { + "epoch": 2.96104668450788, + "grad_norm": 1.2400668859481812, + "learning_rate": 7.051129607609989e-06, + "loss": 0.4768, + "step": 1246 + }, + { + "epoch": 2.9634255129348794, + "grad_norm": 0.77193284034729, + "learning_rate": 7.048751486325803e-06, + "loss": 0.379, + "step": 1247 + }, + { + "epoch": 2.9658043413618795, + "grad_norm": 1.0226027965545654, + "learning_rate": 7.046373365041618e-06, + "loss": 0.4206, + "step": 1248 + }, + { + "epoch": 2.9681831697888788, + "grad_norm": 1.0397319793701172, + "learning_rate": 7.0439952437574326e-06, + "loss": 0.4065, + "step": 1249 + }, + { + "epoch": 2.970561998215879, + "grad_norm": 0.954684317111969, + "learning_rate": 7.041617122473246e-06, + "loss": 0.4369, + "step": 1250 + }, + { + "epoch": 2.970561998215879, + "eval_loss": 0.4436522424221039, + "eval_runtime": 23.2483, + "eval_samples_per_second": 32.174, + "eval_steps_per_second": 16.087, + "step": 1250 + }, + { + "epoch": 2.9729408266428785, + "grad_norm": 0.8863274455070496, + "learning_rate": 7.039239001189061e-06, + "loss": 0.4139, + "step": 1251 + }, + { + "epoch": 2.975319655069878, + "grad_norm": 0.8522990345954895, + "learning_rate": 7.0368608799048755e-06, + "loss": 0.3337, + "step": 1252 + }, + { + "epoch": 2.977698483496878, + "grad_norm": 1.0298359394073486, + "learning_rate": 7.03448275862069e-06, + "loss": 0.4246, + "step": 1253 + }, + { + "epoch": 2.9800773119238775, + "grad_norm": 0.9469637274742126, + "learning_rate": 7.032104637336504e-06, + "loss": 0.4593, + "step": 1254 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.8973608016967773, + "learning_rate": 7.0297265160523185e-06, + "loss": 0.4023, + "step": 1255 + }, + { + "epoch": 2.984834968777877, + "grad_norm": 0.9289575219154358, + "learning_rate": 7.027348394768134e-06, + "loss": 0.4251, + "step": 1256 + }, + { + "epoch": 2.9872137972048765, + "grad_norm": 0.7918210029602051, + "learning_rate": 7.024970273483949e-06, + "loss": 0.3814, + "step": 1257 + }, + { + "epoch": 2.989592625631876, + "grad_norm": 1.0034514665603638, + "learning_rate": 7.022592152199763e-06, + "loss": 0.4043, + "step": 1258 + }, + { + "epoch": 2.991971454058876, + "grad_norm": 0.8934879302978516, + "learning_rate": 7.020214030915578e-06, + "loss": 0.4053, + "step": 1259 + }, + { + "epoch": 2.994350282485876, + "grad_norm": 0.9705380797386169, + "learning_rate": 7.017835909631392e-06, + "loss": 0.3934, + "step": 1260 + }, + { + "epoch": 2.9967291109128755, + "grad_norm": 0.872053861618042, + "learning_rate": 7.015457788347206e-06, + "loss": 0.3711, + "step": 1261 + }, + { + "epoch": 2.999107939339875, + "grad_norm": 1.101946234703064, + "learning_rate": 7.013079667063021e-06, + "loss": 0.4634, + "step": 1262 + }, + { + "epoch": 3.0, + "grad_norm": 1.6962785720825195, + "learning_rate": 7.010701545778835e-06, + "loss": 0.3806, + "step": 1263 + }, + { + "epoch": 3.0023788284269997, + "grad_norm": 0.9009426236152649, + "learning_rate": 7.00832342449465e-06, + "loss": 0.4644, + "step": 1264 + }, + { + "epoch": 3.0047576568539993, + "grad_norm": 0.9615646004676819, + "learning_rate": 7.005945303210464e-06, + "loss": 0.4304, + "step": 1265 + }, + { + "epoch": 3.007136485280999, + "grad_norm": 1.011570930480957, + "learning_rate": 7.003567181926278e-06, + "loss": 0.4514, + "step": 1266 + }, + { + "epoch": 3.0095153137079986, + "grad_norm": 0.9945147037506104, + "learning_rate": 7.001189060642093e-06, + "loss": 0.418, + "step": 1267 + }, + { + "epoch": 3.0118941421349987, + "grad_norm": 0.9950351119041443, + "learning_rate": 6.998810939357908e-06, + "loss": 0.3797, + "step": 1268 + }, + { + "epoch": 3.0142729705619984, + "grad_norm": 1.0433846712112427, + "learning_rate": 6.996432818073723e-06, + "loss": 0.3811, + "step": 1269 + }, + { + "epoch": 3.016651798988998, + "grad_norm": 0.8838910460472107, + "learning_rate": 6.994054696789538e-06, + "loss": 0.4118, + "step": 1270 + }, + { + "epoch": 3.0190306274159977, + "grad_norm": 0.8254680037498474, + "learning_rate": 6.9916765755053515e-06, + "loss": 0.4104, + "step": 1271 + }, + { + "epoch": 3.0214094558429974, + "grad_norm": 0.9084920883178711, + "learning_rate": 6.989298454221166e-06, + "loss": 0.3428, + "step": 1272 + }, + { + "epoch": 3.023788284269997, + "grad_norm": 1.0350019931793213, + "learning_rate": 6.986920332936981e-06, + "loss": 0.4819, + "step": 1273 + }, + { + "epoch": 3.0261671126969967, + "grad_norm": 1.001437783241272, + "learning_rate": 6.984542211652795e-06, + "loss": 0.4039, + "step": 1274 + }, + { + "epoch": 3.0285459411239963, + "grad_norm": 1.0283564329147339, + "learning_rate": 6.982164090368609e-06, + "loss": 0.384, + "step": 1275 + }, + { + "epoch": 3.030924769550996, + "grad_norm": 1.1317181587219238, + "learning_rate": 6.979785969084424e-06, + "loss": 0.4304, + "step": 1276 + }, + { + "epoch": 3.0333035979779956, + "grad_norm": 0.9317311644554138, + "learning_rate": 6.977407847800238e-06, + "loss": 0.3467, + "step": 1277 + }, + { + "epoch": 3.0356824264049957, + "grad_norm": 0.9818142652511597, + "learning_rate": 6.975029726516053e-06, + "loss": 0.3174, + "step": 1278 + }, + { + "epoch": 3.0380612548319954, + "grad_norm": 0.88284832239151, + "learning_rate": 6.9726516052318675e-06, + "loss": 0.3586, + "step": 1279 + }, + { + "epoch": 3.040440083258995, + "grad_norm": 0.9460397958755493, + "learning_rate": 6.970273483947681e-06, + "loss": 0.3944, + "step": 1280 + }, + { + "epoch": 3.0428189116859947, + "grad_norm": 0.8816782832145691, + "learning_rate": 6.967895362663496e-06, + "loss": 0.3528, + "step": 1281 + }, + { + "epoch": 3.0451977401129944, + "grad_norm": 0.8847249150276184, + "learning_rate": 6.9655172413793105e-06, + "loss": 0.4172, + "step": 1282 + }, + { + "epoch": 3.047576568539994, + "grad_norm": 0.8944831490516663, + "learning_rate": 6.963139120095126e-06, + "loss": 0.4018, + "step": 1283 + }, + { + "epoch": 3.0499553969669937, + "grad_norm": 0.9917512536048889, + "learning_rate": 6.9607609988109406e-06, + "loss": 0.4465, + "step": 1284 + }, + { + "epoch": 3.0523342253939933, + "grad_norm": 1.0698322057724, + "learning_rate": 6.958382877526755e-06, + "loss": 0.4491, + "step": 1285 + }, + { + "epoch": 3.054713053820993, + "grad_norm": 1.0102977752685547, + "learning_rate": 6.956004756242569e-06, + "loss": 0.4522, + "step": 1286 + }, + { + "epoch": 3.0570918822479927, + "grad_norm": 1.074791669845581, + "learning_rate": 6.9536266349583835e-06, + "loss": 0.4046, + "step": 1287 + }, + { + "epoch": 3.0594707106749928, + "grad_norm": 0.8747156858444214, + "learning_rate": 6.951248513674198e-06, + "loss": 0.3614, + "step": 1288 + }, + { + "epoch": 3.0618495391019924, + "grad_norm": 0.9357925653457642, + "learning_rate": 6.948870392390013e-06, + "loss": 0.375, + "step": 1289 + }, + { + "epoch": 3.064228367528992, + "grad_norm": 0.8821314573287964, + "learning_rate": 6.9464922711058265e-06, + "loss": 0.3828, + "step": 1290 + }, + { + "epoch": 3.0666071959559917, + "grad_norm": 1.1089438199996948, + "learning_rate": 6.944114149821641e-06, + "loss": 0.4424, + "step": 1291 + }, + { + "epoch": 3.0689860243829914, + "grad_norm": 0.9361308217048645, + "learning_rate": 6.941736028537456e-06, + "loss": 0.4065, + "step": 1292 + }, + { + "epoch": 3.071364852809991, + "grad_norm": 0.8765494227409363, + "learning_rate": 6.93935790725327e-06, + "loss": 0.3764, + "step": 1293 + }, + { + "epoch": 3.0737436812369907, + "grad_norm": 0.9663063287734985, + "learning_rate": 6.936979785969085e-06, + "loss": 0.4063, + "step": 1294 + }, + { + "epoch": 3.0761225096639904, + "grad_norm": 0.951029896736145, + "learning_rate": 6.934601664684899e-06, + "loss": 0.4509, + "step": 1295 + }, + { + "epoch": 3.07850133809099, + "grad_norm": 1.056810736656189, + "learning_rate": 6.932223543400713e-06, + "loss": 0.4875, + "step": 1296 + }, + { + "epoch": 3.0808801665179897, + "grad_norm": 0.844926118850708, + "learning_rate": 6.929845422116529e-06, + "loss": 0.334, + "step": 1297 + }, + { + "epoch": 3.08325899494499, + "grad_norm": 0.8137670159339905, + "learning_rate": 6.927467300832343e-06, + "loss": 0.4056, + "step": 1298 + }, + { + "epoch": 3.0856378233719894, + "grad_norm": 0.9426771998405457, + "learning_rate": 6.925089179548158e-06, + "loss": 0.4472, + "step": 1299 + }, + { + "epoch": 3.088016651798989, + "grad_norm": 0.8821436762809753, + "learning_rate": 6.922711058263973e-06, + "loss": 0.3731, + "step": 1300 + }, + { + "epoch": 3.088016651798989, + "eval_loss": 0.44242703914642334, + "eval_runtime": 23.6126, + "eval_samples_per_second": 31.678, + "eval_steps_per_second": 15.839, + "step": 1300 + }, + { + "epoch": 3.0903954802259888, + "grad_norm": 0.9075760245323181, + "learning_rate": 6.920332936979786e-06, + "loss": 0.3981, + "step": 1301 + }, + { + "epoch": 3.0927743086529884, + "grad_norm": 1.0439802408218384, + "learning_rate": 6.917954815695601e-06, + "loss": 0.4097, + "step": 1302 + }, + { + "epoch": 3.095153137079988, + "grad_norm": 0.9156201481819153, + "learning_rate": 6.915576694411416e-06, + "loss": 0.4093, + "step": 1303 + }, + { + "epoch": 3.0975319655069877, + "grad_norm": 0.9061574935913086, + "learning_rate": 6.91319857312723e-06, + "loss": 0.4006, + "step": 1304 + }, + { + "epoch": 3.0999107939339874, + "grad_norm": 0.9664219617843628, + "learning_rate": 6.910820451843044e-06, + "loss": 0.3673, + "step": 1305 + }, + { + "epoch": 3.102289622360987, + "grad_norm": 0.8090248107910156, + "learning_rate": 6.908442330558859e-06, + "loss": 0.3613, + "step": 1306 + }, + { + "epoch": 3.1046684507879867, + "grad_norm": 1.0396835803985596, + "learning_rate": 6.906064209274673e-06, + "loss": 0.4831, + "step": 1307 + }, + { + "epoch": 3.107047279214987, + "grad_norm": 0.9633468389511108, + "learning_rate": 6.903686087990488e-06, + "loss": 0.4549, + "step": 1308 + }, + { + "epoch": 3.1094261076419865, + "grad_norm": 1.078898549079895, + "learning_rate": 6.9013079667063024e-06, + "loss": 0.4499, + "step": 1309 + }, + { + "epoch": 3.111804936068986, + "grad_norm": 0.8830727934837341, + "learning_rate": 6.898929845422118e-06, + "loss": 0.3501, + "step": 1310 + }, + { + "epoch": 3.1141837644959858, + "grad_norm": 1.048128604888916, + "learning_rate": 6.896551724137932e-06, + "loss": 0.4407, + "step": 1311 + }, + { + "epoch": 3.1165625929229854, + "grad_norm": 0.8581830859184265, + "learning_rate": 6.894173602853746e-06, + "loss": 0.4193, + "step": 1312 + }, + { + "epoch": 3.118941421349985, + "grad_norm": 0.9572068452835083, + "learning_rate": 6.891795481569561e-06, + "loss": 0.3702, + "step": 1313 + }, + { + "epoch": 3.1213202497769847, + "grad_norm": 1.0561386346817017, + "learning_rate": 6.8894173602853755e-06, + "loss": 0.4908, + "step": 1314 + }, + { + "epoch": 3.1236990782039844, + "grad_norm": 1.0810679197311401, + "learning_rate": 6.88703923900119e-06, + "loss": 0.4243, + "step": 1315 + }, + { + "epoch": 3.126077906630984, + "grad_norm": 0.8239559531211853, + "learning_rate": 6.884661117717004e-06, + "loss": 0.3845, + "step": 1316 + }, + { + "epoch": 3.128456735057984, + "grad_norm": 1.0012342929840088, + "learning_rate": 6.8822829964328185e-06, + "loss": 0.3953, + "step": 1317 + }, + { + "epoch": 3.130835563484984, + "grad_norm": 0.9737454652786255, + "learning_rate": 6.879904875148633e-06, + "loss": 0.3495, + "step": 1318 + }, + { + "epoch": 3.1332143919119835, + "grad_norm": 0.9443467855453491, + "learning_rate": 6.877526753864448e-06, + "loss": 0.4072, + "step": 1319 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 0.9639855027198792, + "learning_rate": 6.8751486325802615e-06, + "loss": 0.4598, + "step": 1320 + }, + { + "epoch": 3.137972048765983, + "grad_norm": 0.9538611769676208, + "learning_rate": 6.872770511296076e-06, + "loss": 0.4252, + "step": 1321 + }, + { + "epoch": 3.1403508771929824, + "grad_norm": 0.9559749960899353, + "learning_rate": 6.870392390011891e-06, + "loss": 0.4137, + "step": 1322 + }, + { + "epoch": 3.142729705619982, + "grad_norm": 0.9332882165908813, + "learning_rate": 6.868014268727705e-06, + "loss": 0.4233, + "step": 1323 + }, + { + "epoch": 3.1451085340469818, + "grad_norm": 0.9305432438850403, + "learning_rate": 6.865636147443521e-06, + "loss": 0.3731, + "step": 1324 + }, + { + "epoch": 3.1474873624739814, + "grad_norm": 0.9804054498672485, + "learning_rate": 6.863258026159335e-06, + "loss": 0.418, + "step": 1325 + }, + { + "epoch": 3.149866190900981, + "grad_norm": 0.8923971652984619, + "learning_rate": 6.860879904875149e-06, + "loss": 0.3312, + "step": 1326 + }, + { + "epoch": 3.1522450193279807, + "grad_norm": 0.9091454744338989, + "learning_rate": 6.858501783590964e-06, + "loss": 0.3726, + "step": 1327 + }, + { + "epoch": 3.154623847754981, + "grad_norm": 1.0600782632827759, + "learning_rate": 6.856123662306778e-06, + "loss": 0.3792, + "step": 1328 + }, + { + "epoch": 3.1570026761819805, + "grad_norm": 0.8605880737304688, + "learning_rate": 6.853745541022593e-06, + "loss": 0.3515, + "step": 1329 + }, + { + "epoch": 3.15938150460898, + "grad_norm": 0.9914798736572266, + "learning_rate": 6.8513674197384076e-06, + "loss": 0.4606, + "step": 1330 + }, + { + "epoch": 3.16176033303598, + "grad_norm": 0.913550615310669, + "learning_rate": 6.848989298454221e-06, + "loss": 0.3019, + "step": 1331 + }, + { + "epoch": 3.1641391614629795, + "grad_norm": 0.9396856427192688, + "learning_rate": 6.846611177170036e-06, + "loss": 0.3999, + "step": 1332 + }, + { + "epoch": 3.166517989889979, + "grad_norm": 0.8926559686660767, + "learning_rate": 6.8442330558858506e-06, + "loss": 0.4478, + "step": 1333 + }, + { + "epoch": 3.168896818316979, + "grad_norm": 0.9025640487670898, + "learning_rate": 6.841854934601665e-06, + "loss": 0.3397, + "step": 1334 + }, + { + "epoch": 3.1712756467439784, + "grad_norm": 0.9519430994987488, + "learning_rate": 6.83947681331748e-06, + "loss": 0.3819, + "step": 1335 + }, + { + "epoch": 3.173654475170978, + "grad_norm": 1.0188910961151123, + "learning_rate": 6.8370986920332935e-06, + "loss": 0.4442, + "step": 1336 + }, + { + "epoch": 3.176033303597978, + "grad_norm": 0.9398928880691528, + "learning_rate": 6.834720570749108e-06, + "loss": 0.3702, + "step": 1337 + }, + { + "epoch": 3.178412132024978, + "grad_norm": 0.9894818663597107, + "learning_rate": 6.832342449464924e-06, + "loss": 0.4073, + "step": 1338 + }, + { + "epoch": 3.1807909604519775, + "grad_norm": 1.0047675371170044, + "learning_rate": 6.829964328180738e-06, + "loss": 0.3867, + "step": 1339 + }, + { + "epoch": 3.183169788878977, + "grad_norm": 1.0889837741851807, + "learning_rate": 6.827586206896553e-06, + "loss": 0.421, + "step": 1340 + }, + { + "epoch": 3.185548617305977, + "grad_norm": 0.9876818656921387, + "learning_rate": 6.8252080856123674e-06, + "loss": 0.4309, + "step": 1341 + }, + { + "epoch": 3.1879274457329765, + "grad_norm": 1.1279778480529785, + "learning_rate": 6.822829964328181e-06, + "loss": 0.4343, + "step": 1342 + }, + { + "epoch": 3.190306274159976, + "grad_norm": 0.8841944336891174, + "learning_rate": 6.820451843043996e-06, + "loss": 0.384, + "step": 1343 + }, + { + "epoch": 3.192685102586976, + "grad_norm": 0.8207550048828125, + "learning_rate": 6.8180737217598104e-06, + "loss": 0.3692, + "step": 1344 + }, + { + "epoch": 3.1950639310139755, + "grad_norm": 1.015608310699463, + "learning_rate": 6.815695600475625e-06, + "loss": 0.4155, + "step": 1345 + }, + { + "epoch": 3.197442759440975, + "grad_norm": 0.94300776720047, + "learning_rate": 6.813317479191439e-06, + "loss": 0.4682, + "step": 1346 + }, + { + "epoch": 3.199821587867975, + "grad_norm": 1.1306087970733643, + "learning_rate": 6.810939357907253e-06, + "loss": 0.3941, + "step": 1347 + }, + { + "epoch": 3.202200416294975, + "grad_norm": 0.9265711307525635, + "learning_rate": 6.808561236623068e-06, + "loss": 0.3527, + "step": 1348 + }, + { + "epoch": 3.2045792447219745, + "grad_norm": 0.9326797723770142, + "learning_rate": 6.806183115338883e-06, + "loss": 0.3613, + "step": 1349 + }, + { + "epoch": 3.206958073148974, + "grad_norm": 0.9236562252044678, + "learning_rate": 6.803804994054697e-06, + "loss": 0.3682, + "step": 1350 + }, + { + "epoch": 3.206958073148974, + "eval_loss": 0.4410729706287384, + "eval_runtime": 23.3609, + "eval_samples_per_second": 32.019, + "eval_steps_per_second": 16.01, + "step": 1350 + }, + { + "epoch": 3.209336901575974, + "grad_norm": 0.9618815779685974, + "learning_rate": 6.801426872770513e-06, + "loss": 0.3593, + "step": 1351 + }, + { + "epoch": 3.2117157300029735, + "grad_norm": 1.0505785942077637, + "learning_rate": 6.7990487514863265e-06, + "loss": 0.3277, + "step": 1352 + }, + { + "epoch": 3.214094558429973, + "grad_norm": 0.9098619222640991, + "learning_rate": 6.796670630202141e-06, + "loss": 0.3817, + "step": 1353 + }, + { + "epoch": 3.216473386856973, + "grad_norm": 0.9664829969406128, + "learning_rate": 6.794292508917956e-06, + "loss": 0.3847, + "step": 1354 + }, + { + "epoch": 3.2188522152839725, + "grad_norm": 0.9798524975776672, + "learning_rate": 6.79191438763377e-06, + "loss": 0.3816, + "step": 1355 + }, + { + "epoch": 3.221231043710972, + "grad_norm": 1.0888880491256714, + "learning_rate": 6.789536266349585e-06, + "loss": 0.4171, + "step": 1356 + }, + { + "epoch": 3.2236098721379722, + "grad_norm": 0.9652047157287598, + "learning_rate": 6.787158145065399e-06, + "loss": 0.404, + "step": 1357 + }, + { + "epoch": 3.225988700564972, + "grad_norm": 0.9262255430221558, + "learning_rate": 6.784780023781213e-06, + "loss": 0.4114, + "step": 1358 + }, + { + "epoch": 3.2283675289919715, + "grad_norm": 1.1045382022857666, + "learning_rate": 6.782401902497028e-06, + "loss": 0.5055, + "step": 1359 + }, + { + "epoch": 3.230746357418971, + "grad_norm": 0.9476077556610107, + "learning_rate": 6.7800237812128425e-06, + "loss": 0.3349, + "step": 1360 + }, + { + "epoch": 3.233125185845971, + "grad_norm": 1.0316661596298218, + "learning_rate": 6.777645659928656e-06, + "loss": 0.4329, + "step": 1361 + }, + { + "epoch": 3.2355040142729705, + "grad_norm": 0.9138137698173523, + "learning_rate": 6.775267538644471e-06, + "loss": 0.3786, + "step": 1362 + }, + { + "epoch": 3.23788284269997, + "grad_norm": 1.0319725275039673, + "learning_rate": 6.7728894173602855e-06, + "loss": 0.3375, + "step": 1363 + }, + { + "epoch": 3.24026167112697, + "grad_norm": 0.95320725440979, + "learning_rate": 6.7705112960761e-06, + "loss": 0.318, + "step": 1364 + }, + { + "epoch": 3.2426404995539695, + "grad_norm": 0.9980164766311646, + "learning_rate": 6.7681331747919156e-06, + "loss": 0.3327, + "step": 1365 + }, + { + "epoch": 3.2450193279809696, + "grad_norm": 1.0737320184707642, + "learning_rate": 6.76575505350773e-06, + "loss": 0.4817, + "step": 1366 + }, + { + "epoch": 3.2473981564079692, + "grad_norm": 1.05515456199646, + "learning_rate": 6.763376932223544e-06, + "loss": 0.3904, + "step": 1367 + }, + { + "epoch": 3.249776984834969, + "grad_norm": 0.8104050755500793, + "learning_rate": 6.7609988109393585e-06, + "loss": 0.3364, + "step": 1368 + }, + { + "epoch": 3.2521558132619686, + "grad_norm": 1.0520200729370117, + "learning_rate": 6.758620689655173e-06, + "loss": 0.3645, + "step": 1369 + }, + { + "epoch": 3.254534641688968, + "grad_norm": 0.9811660051345825, + "learning_rate": 6.756242568370988e-06, + "loss": 0.4353, + "step": 1370 + }, + { + "epoch": 3.256913470115968, + "grad_norm": 0.9865298271179199, + "learning_rate": 6.753864447086802e-06, + "loss": 0.3512, + "step": 1371 + }, + { + "epoch": 3.2592922985429675, + "grad_norm": 1.0737806558609009, + "learning_rate": 6.751486325802616e-06, + "loss": 0.401, + "step": 1372 + }, + { + "epoch": 3.261671126969967, + "grad_norm": 1.0930390357971191, + "learning_rate": 6.749108204518431e-06, + "loss": 0.4107, + "step": 1373 + }, + { + "epoch": 3.264049955396967, + "grad_norm": 0.9813482165336609, + "learning_rate": 6.746730083234245e-06, + "loss": 0.3804, + "step": 1374 + }, + { + "epoch": 3.2664287838239665, + "grad_norm": 0.947160542011261, + "learning_rate": 6.74435196195006e-06, + "loss": 0.3727, + "step": 1375 + }, + { + "epoch": 3.268807612250966, + "grad_norm": 0.9508016705513, + "learning_rate": 6.741973840665874e-06, + "loss": 0.3389, + "step": 1376 + }, + { + "epoch": 3.2711864406779663, + "grad_norm": 0.864435613155365, + "learning_rate": 6.739595719381688e-06, + "loss": 0.3746, + "step": 1377 + }, + { + "epoch": 3.273565269104966, + "grad_norm": 1.0789365768432617, + "learning_rate": 6.737217598097503e-06, + "loss": 0.4382, + "step": 1378 + }, + { + "epoch": 3.2759440975319656, + "grad_norm": 0.9950647950172424, + "learning_rate": 6.734839476813318e-06, + "loss": 0.3825, + "step": 1379 + }, + { + "epoch": 3.2783229259589652, + "grad_norm": 0.9799882173538208, + "learning_rate": 6.732461355529133e-06, + "loss": 0.3621, + "step": 1380 + }, + { + "epoch": 3.280701754385965, + "grad_norm": 1.0391459465026855, + "learning_rate": 6.730083234244948e-06, + "loss": 0.455, + "step": 1381 + }, + { + "epoch": 3.2830805828129646, + "grad_norm": 1.000344157218933, + "learning_rate": 6.727705112960761e-06, + "loss": 0.4133, + "step": 1382 + }, + { + "epoch": 3.285459411239964, + "grad_norm": 0.9783710837364197, + "learning_rate": 6.725326991676576e-06, + "loss": 0.4342, + "step": 1383 + }, + { + "epoch": 3.287838239666964, + "grad_norm": 0.9452858567237854, + "learning_rate": 6.722948870392391e-06, + "loss": 0.3511, + "step": 1384 + }, + { + "epoch": 3.2902170680939635, + "grad_norm": 0.96871417760849, + "learning_rate": 6.720570749108205e-06, + "loss": 0.3713, + "step": 1385 + }, + { + "epoch": 3.2925958965209636, + "grad_norm": 1.0196559429168701, + "learning_rate": 6.71819262782402e-06, + "loss": 0.443, + "step": 1386 + }, + { + "epoch": 3.2949747249479633, + "grad_norm": 0.8790106773376465, + "learning_rate": 6.715814506539834e-06, + "loss": 0.3861, + "step": 1387 + }, + { + "epoch": 3.297353553374963, + "grad_norm": 1.072209358215332, + "learning_rate": 6.713436385255648e-06, + "loss": 0.4478, + "step": 1388 + }, + { + "epoch": 3.2997323818019626, + "grad_norm": 1.0417574644088745, + "learning_rate": 6.711058263971463e-06, + "loss": 0.4364, + "step": 1389 + }, + { + "epoch": 3.3021112102289623, + "grad_norm": 0.9600133895874023, + "learning_rate": 6.7086801426872774e-06, + "loss": 0.3747, + "step": 1390 + }, + { + "epoch": 3.304490038655962, + "grad_norm": 1.0975148677825928, + "learning_rate": 6.706302021403091e-06, + "loss": 0.4473, + "step": 1391 + }, + { + "epoch": 3.3068688670829616, + "grad_norm": 0.9140797257423401, + "learning_rate": 6.7039239001189075e-06, + "loss": 0.4376, + "step": 1392 + }, + { + "epoch": 3.3092476955099612, + "grad_norm": 0.8515641689300537, + "learning_rate": 6.701545778834721e-06, + "loss": 0.3221, + "step": 1393 + }, + { + "epoch": 3.311626523936961, + "grad_norm": 0.9893806576728821, + "learning_rate": 6.699167657550536e-06, + "loss": 0.3701, + "step": 1394 + }, + { + "epoch": 3.314005352363961, + "grad_norm": 1.074258804321289, + "learning_rate": 6.6967895362663505e-06, + "loss": 0.4166, + "step": 1395 + }, + { + "epoch": 3.3163841807909606, + "grad_norm": 0.9301339387893677, + "learning_rate": 6.694411414982165e-06, + "loss": 0.3888, + "step": 1396 + }, + { + "epoch": 3.3187630092179603, + "grad_norm": 0.8941574096679688, + "learning_rate": 6.692033293697979e-06, + "loss": 0.3823, + "step": 1397 + }, + { + "epoch": 3.32114183764496, + "grad_norm": 0.9893161654472351, + "learning_rate": 6.6896551724137935e-06, + "loss": 0.454, + "step": 1398 + }, + { + "epoch": 3.3235206660719596, + "grad_norm": 0.902401864528656, + "learning_rate": 6.687277051129608e-06, + "loss": 0.3475, + "step": 1399 + }, + { + "epoch": 3.3258994944989593, + "grad_norm": 1.0300347805023193, + "learning_rate": 6.684898929845423e-06, + "loss": 0.4158, + "step": 1400 + }, + { + "epoch": 3.3258994944989593, + "eval_loss": 0.43968307971954346, + "eval_runtime": 23.439, + "eval_samples_per_second": 31.913, + "eval_steps_per_second": 15.956, + "step": 1400 + }, + { + "epoch": 3.328278322925959, + "grad_norm": 1.0037415027618408, + "learning_rate": 6.682520808561237e-06, + "loss": 0.4296, + "step": 1401 + }, + { + "epoch": 3.3306571513529586, + "grad_norm": 1.0259783267974854, + "learning_rate": 6.680142687277051e-06, + "loss": 0.4207, + "step": 1402 + }, + { + "epoch": 3.3330359797799582, + "grad_norm": 1.0681015253067017, + "learning_rate": 6.677764565992866e-06, + "loss": 0.3704, + "step": 1403 + }, + { + "epoch": 3.335414808206958, + "grad_norm": 1.0281389951705933, + "learning_rate": 6.67538644470868e-06, + "loss": 0.3978, + "step": 1404 + }, + { + "epoch": 3.3377936366339576, + "grad_norm": 1.0064321756362915, + "learning_rate": 6.673008323424495e-06, + "loss": 0.347, + "step": 1405 + }, + { + "epoch": 3.3401724650609577, + "grad_norm": 1.1888902187347412, + "learning_rate": 6.67063020214031e-06, + "loss": 0.4523, + "step": 1406 + }, + { + "epoch": 3.3425512934879573, + "grad_norm": 0.9056865572929382, + "learning_rate": 6.668252080856125e-06, + "loss": 0.3905, + "step": 1407 + }, + { + "epoch": 3.344930121914957, + "grad_norm": 0.9816499948501587, + "learning_rate": 6.665873959571939e-06, + "loss": 0.4161, + "step": 1408 + }, + { + "epoch": 3.3473089503419566, + "grad_norm": 1.036423683166504, + "learning_rate": 6.663495838287753e-06, + "loss": 0.4008, + "step": 1409 + }, + { + "epoch": 3.3496877787689563, + "grad_norm": 0.978205680847168, + "learning_rate": 6.661117717003568e-06, + "loss": 0.4239, + "step": 1410 + }, + { + "epoch": 3.352066607195956, + "grad_norm": 0.9356462955474854, + "learning_rate": 6.658739595719383e-06, + "loss": 0.4298, + "step": 1411 + }, + { + "epoch": 3.3544454356229556, + "grad_norm": 1.1239317655563354, + "learning_rate": 6.656361474435196e-06, + "loss": 0.4539, + "step": 1412 + }, + { + "epoch": 3.3568242640499553, + "grad_norm": 0.9873788952827454, + "learning_rate": 6.653983353151011e-06, + "loss": 0.4441, + "step": 1413 + }, + { + "epoch": 3.359203092476955, + "grad_norm": 1.0687034130096436, + "learning_rate": 6.6516052318668256e-06, + "loss": 0.4351, + "step": 1414 + }, + { + "epoch": 3.361581920903955, + "grad_norm": 0.9933961033821106, + "learning_rate": 6.64922711058264e-06, + "loss": 0.3575, + "step": 1415 + }, + { + "epoch": 3.3639607493309547, + "grad_norm": 1.0493537187576294, + "learning_rate": 6.646848989298455e-06, + "loss": 0.4061, + "step": 1416 + }, + { + "epoch": 3.3663395777579543, + "grad_norm": 1.1057562828063965, + "learning_rate": 6.6444708680142686e-06, + "loss": 0.4121, + "step": 1417 + }, + { + "epoch": 3.368718406184954, + "grad_norm": 0.9277777075767517, + "learning_rate": 6.642092746730083e-06, + "loss": 0.3757, + "step": 1418 + }, + { + "epoch": 3.3710972346119537, + "grad_norm": 0.9733612537384033, + "learning_rate": 6.639714625445898e-06, + "loss": 0.3278, + "step": 1419 + }, + { + "epoch": 3.3734760630389533, + "grad_norm": 1.0038756132125854, + "learning_rate": 6.637336504161713e-06, + "loss": 0.3784, + "step": 1420 + }, + { + "epoch": 3.375854891465953, + "grad_norm": 0.992900013923645, + "learning_rate": 6.634958382877528e-06, + "loss": 0.4222, + "step": 1421 + }, + { + "epoch": 3.3782337198929526, + "grad_norm": 0.9515736699104309, + "learning_rate": 6.6325802615933425e-06, + "loss": 0.4318, + "step": 1422 + }, + { + "epoch": 3.3806125483199523, + "grad_norm": 0.9767106175422668, + "learning_rate": 6.630202140309156e-06, + "loss": 0.4279, + "step": 1423 + }, + { + "epoch": 3.382991376746952, + "grad_norm": 1.010475993156433, + "learning_rate": 6.627824019024971e-06, + "loss": 0.4402, + "step": 1424 + }, + { + "epoch": 3.3853702051739516, + "grad_norm": 0.979476809501648, + "learning_rate": 6.6254458977407854e-06, + "loss": 0.3823, + "step": 1425 + }, + { + "epoch": 3.3877490336009517, + "grad_norm": 1.0437548160552979, + "learning_rate": 6.6230677764566e-06, + "loss": 0.3782, + "step": 1426 + }, + { + "epoch": 3.3901278620279514, + "grad_norm": 0.8654898405075073, + "learning_rate": 6.620689655172415e-06, + "loss": 0.3604, + "step": 1427 + }, + { + "epoch": 3.392506690454951, + "grad_norm": 1.0310670137405396, + "learning_rate": 6.6183115338882284e-06, + "loss": 0.4201, + "step": 1428 + }, + { + "epoch": 3.3948855188819507, + "grad_norm": 1.1945704221725464, + "learning_rate": 6.615933412604043e-06, + "loss": 0.538, + "step": 1429 + }, + { + "epoch": 3.3972643473089503, + "grad_norm": 1.0463353395462036, + "learning_rate": 6.613555291319858e-06, + "loss": 0.4258, + "step": 1430 + }, + { + "epoch": 3.39964317573595, + "grad_norm": 0.9035159945487976, + "learning_rate": 6.611177170035672e-06, + "loss": 0.3726, + "step": 1431 + }, + { + "epoch": 3.4020220041629496, + "grad_norm": 1.131360650062561, + "learning_rate": 6.608799048751486e-06, + "loss": 0.5345, + "step": 1432 + }, + { + "epoch": 3.4044008325899493, + "grad_norm": 1.09908127784729, + "learning_rate": 6.6064209274673015e-06, + "loss": 0.4223, + "step": 1433 + }, + { + "epoch": 3.406779661016949, + "grad_norm": 1.0405066013336182, + "learning_rate": 6.604042806183116e-06, + "loss": 0.3715, + "step": 1434 + }, + { + "epoch": 3.409158489443949, + "grad_norm": 0.9897058606147766, + "learning_rate": 6.601664684898931e-06, + "loss": 0.427, + "step": 1435 + }, + { + "epoch": 3.4115373178709487, + "grad_norm": 1.0087716579437256, + "learning_rate": 6.599286563614745e-06, + "loss": 0.3725, + "step": 1436 + }, + { + "epoch": 3.4139161462979484, + "grad_norm": 0.9231634736061096, + "learning_rate": 6.59690844233056e-06, + "loss": 0.368, + "step": 1437 + }, + { + "epoch": 3.416294974724948, + "grad_norm": 1.0480480194091797, + "learning_rate": 6.594530321046374e-06, + "loss": 0.4788, + "step": 1438 + }, + { + "epoch": 3.4186738031519477, + "grad_norm": 1.050686001777649, + "learning_rate": 6.592152199762188e-06, + "loss": 0.3889, + "step": 1439 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.9522033333778381, + "learning_rate": 6.589774078478003e-06, + "loss": 0.4225, + "step": 1440 + }, + { + "epoch": 3.423431460005947, + "grad_norm": 1.091211199760437, + "learning_rate": 6.5873959571938175e-06, + "loss": 0.4293, + "step": 1441 + }, + { + "epoch": 3.4258102884329467, + "grad_norm": 1.0349845886230469, + "learning_rate": 6.585017835909632e-06, + "loss": 0.3274, + "step": 1442 + }, + { + "epoch": 3.4281891168599463, + "grad_norm": 1.1101242303848267, + "learning_rate": 6.582639714625446e-06, + "loss": 0.4116, + "step": 1443 + }, + { + "epoch": 3.4305679452869464, + "grad_norm": 0.9655921459197998, + "learning_rate": 6.5802615933412605e-06, + "loss": 0.3371, + "step": 1444 + }, + { + "epoch": 3.432946773713946, + "grad_norm": 1.1748008728027344, + "learning_rate": 6.577883472057075e-06, + "loss": 0.5111, + "step": 1445 + }, + { + "epoch": 3.4353256021409457, + "grad_norm": 1.0712281465530396, + "learning_rate": 6.57550535077289e-06, + "loss": 0.3951, + "step": 1446 + }, + { + "epoch": 3.4377044305679454, + "grad_norm": 0.9222583770751953, + "learning_rate": 6.573127229488705e-06, + "loss": 0.3813, + "step": 1447 + }, + { + "epoch": 3.440083258994945, + "grad_norm": 0.9447400569915771, + "learning_rate": 6.57074910820452e-06, + "loss": 0.395, + "step": 1448 + }, + { + "epoch": 3.4424620874219447, + "grad_norm": 0.9074904918670654, + "learning_rate": 6.5683709869203336e-06, + "loss": 0.3634, + "step": 1449 + }, + { + "epoch": 3.4448409158489444, + "grad_norm": 1.0380321741104126, + "learning_rate": 6.565992865636148e-06, + "loss": 0.4069, + "step": 1450 + }, + { + "epoch": 3.4448409158489444, + "eval_loss": 0.4380080997943878, + "eval_runtime": 23.2923, + "eval_samples_per_second": 32.114, + "eval_steps_per_second": 16.057, + "step": 1450 + }, + { + "epoch": 3.447219744275944, + "grad_norm": 1.0179827213287354, + "learning_rate": 6.563614744351963e-06, + "loss": 0.4503, + "step": 1451 + }, + { + "epoch": 3.4495985727029437, + "grad_norm": 0.9071374535560608, + "learning_rate": 6.561236623067777e-06, + "loss": 0.3424, + "step": 1452 + }, + { + "epoch": 3.4519774011299433, + "grad_norm": 0.9919977784156799, + "learning_rate": 6.558858501783591e-06, + "loss": 0.4262, + "step": 1453 + }, + { + "epoch": 3.454356229556943, + "grad_norm": 1.0517230033874512, + "learning_rate": 6.556480380499406e-06, + "loss": 0.3764, + "step": 1454 + }, + { + "epoch": 3.456735057983943, + "grad_norm": 0.9944483041763306, + "learning_rate": 6.55410225921522e-06, + "loss": 0.3266, + "step": 1455 + }, + { + "epoch": 3.4591138864109428, + "grad_norm": 1.0693600177764893, + "learning_rate": 6.551724137931035e-06, + "loss": 0.4543, + "step": 1456 + }, + { + "epoch": 3.4614927148379424, + "grad_norm": 1.0311901569366455, + "learning_rate": 6.54934601664685e-06, + "loss": 0.3953, + "step": 1457 + }, + { + "epoch": 3.463871543264942, + "grad_norm": 0.9501886367797852, + "learning_rate": 6.546967895362663e-06, + "loss": 0.4128, + "step": 1458 + }, + { + "epoch": 3.4662503716919417, + "grad_norm": 0.887793242931366, + "learning_rate": 6.544589774078478e-06, + "loss": 0.3532, + "step": 1459 + }, + { + "epoch": 3.4686292001189414, + "grad_norm": 1.0111991167068481, + "learning_rate": 6.542211652794293e-06, + "loss": 0.3908, + "step": 1460 + }, + { + "epoch": 3.471008028545941, + "grad_norm": 1.1187986135482788, + "learning_rate": 6.539833531510108e-06, + "loss": 0.4172, + "step": 1461 + }, + { + "epoch": 3.4733868569729407, + "grad_norm": 0.9068117141723633, + "learning_rate": 6.537455410225923e-06, + "loss": 0.3741, + "step": 1462 + }, + { + "epoch": 3.4757656853999404, + "grad_norm": 0.9004318118095398, + "learning_rate": 6.535077288941737e-06, + "loss": 0.3763, + "step": 1463 + }, + { + "epoch": 3.4781445138269405, + "grad_norm": 0.9837785959243774, + "learning_rate": 6.532699167657551e-06, + "loss": 0.4095, + "step": 1464 + }, + { + "epoch": 3.48052334225394, + "grad_norm": 0.9698705673217773, + "learning_rate": 6.530321046373366e-06, + "loss": 0.4093, + "step": 1465 + }, + { + "epoch": 3.4829021706809398, + "grad_norm": 0.9407247304916382, + "learning_rate": 6.52794292508918e-06, + "loss": 0.3499, + "step": 1466 + }, + { + "epoch": 3.4852809991079394, + "grad_norm": 0.8577081561088562, + "learning_rate": 6.525564803804995e-06, + "loss": 0.3263, + "step": 1467 + }, + { + "epoch": 3.487659827534939, + "grad_norm": 1.1695443391799927, + "learning_rate": 6.523186682520809e-06, + "loss": 0.4624, + "step": 1468 + }, + { + "epoch": 3.4900386559619387, + "grad_norm": 1.0677658319473267, + "learning_rate": 6.520808561236623e-06, + "loss": 0.4551, + "step": 1469 + }, + { + "epoch": 3.4924174843889384, + "grad_norm": 0.9037706851959229, + "learning_rate": 6.518430439952438e-06, + "loss": 0.4171, + "step": 1470 + }, + { + "epoch": 3.494796312815938, + "grad_norm": 1.0306446552276611, + "learning_rate": 6.5160523186682525e-06, + "loss": 0.3675, + "step": 1471 + }, + { + "epoch": 3.4971751412429377, + "grad_norm": 0.9763319492340088, + "learning_rate": 6.513674197384067e-06, + "loss": 0.4653, + "step": 1472 + }, + { + "epoch": 3.4995539696699374, + "grad_norm": 0.9333167672157288, + "learning_rate": 6.511296076099881e-06, + "loss": 0.3231, + "step": 1473 + }, + { + "epoch": 3.501932798096937, + "grad_norm": 0.938674807548523, + "learning_rate": 6.508917954815696e-06, + "loss": 0.329, + "step": 1474 + }, + { + "epoch": 3.504311626523937, + "grad_norm": 1.2521642446517944, + "learning_rate": 6.506539833531511e-06, + "loss": 0.368, + "step": 1475 + }, + { + "epoch": 3.506690454950937, + "grad_norm": 0.9009252190589905, + "learning_rate": 6.5041617122473255e-06, + "loss": 0.3579, + "step": 1476 + }, + { + "epoch": 3.5090692833779364, + "grad_norm": 1.1145917177200317, + "learning_rate": 6.50178359096314e-06, + "loss": 0.4089, + "step": 1477 + }, + { + "epoch": 3.511448111804936, + "grad_norm": 1.0986051559448242, + "learning_rate": 6.499405469678955e-06, + "loss": 0.4645, + "step": 1478 + }, + { + "epoch": 3.5138269402319358, + "grad_norm": 0.9749249815940857, + "learning_rate": 6.4970273483947685e-06, + "loss": 0.3806, + "step": 1479 + }, + { + "epoch": 3.5162057686589354, + "grad_norm": 1.0242209434509277, + "learning_rate": 6.494649227110583e-06, + "loss": 0.4643, + "step": 1480 + }, + { + "epoch": 3.518584597085935, + "grad_norm": 1.046999216079712, + "learning_rate": 6.492271105826398e-06, + "loss": 0.4171, + "step": 1481 + }, + { + "epoch": 3.5209634255129347, + "grad_norm": 1.0536119937896729, + "learning_rate": 6.489892984542212e-06, + "loss": 0.4057, + "step": 1482 + }, + { + "epoch": 3.5233422539399344, + "grad_norm": 1.0235801935195923, + "learning_rate": 6.487514863258026e-06, + "loss": 0.2988, + "step": 1483 + }, + { + "epoch": 3.5257210823669345, + "grad_norm": 1.0805996656417847, + "learning_rate": 6.485136741973841e-06, + "loss": 0.3904, + "step": 1484 + }, + { + "epoch": 3.528099910793934, + "grad_norm": 1.0035511255264282, + "learning_rate": 6.482758620689655e-06, + "loss": 0.3839, + "step": 1485 + }, + { + "epoch": 3.530478739220934, + "grad_norm": 1.181089162826538, + "learning_rate": 6.48038049940547e-06, + "loss": 0.4022, + "step": 1486 + }, + { + "epoch": 3.5328575676479335, + "grad_norm": 1.1342436075210571, + "learning_rate": 6.4780023781212845e-06, + "loss": 0.445, + "step": 1487 + }, + { + "epoch": 3.535236396074933, + "grad_norm": 1.0642750263214111, + "learning_rate": 6.4756242568371e-06, + "loss": 0.4027, + "step": 1488 + }, + { + "epoch": 3.5376152245019328, + "grad_norm": 1.0442306995391846, + "learning_rate": 6.473246135552914e-06, + "loss": 0.3659, + "step": 1489 + }, + { + "epoch": 3.5399940529289324, + "grad_norm": 1.265405297279358, + "learning_rate": 6.470868014268728e-06, + "loss": 0.3836, + "step": 1490 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 1.0862177610397339, + "learning_rate": 6.468489892984543e-06, + "loss": 0.4209, + "step": 1491 + }, + { + "epoch": 3.5447517097829317, + "grad_norm": 0.9165106415748596, + "learning_rate": 6.466111771700358e-06, + "loss": 0.4006, + "step": 1492 + }, + { + "epoch": 3.547130538209932, + "grad_norm": 0.9284043908119202, + "learning_rate": 6.463733650416172e-06, + "loss": 0.3132, + "step": 1493 + }, + { + "epoch": 3.549509366636931, + "grad_norm": 1.0873234272003174, + "learning_rate": 6.461355529131986e-06, + "loss": 0.3778, + "step": 1494 + }, + { + "epoch": 3.551888195063931, + "grad_norm": 1.0475276708602905, + "learning_rate": 6.458977407847801e-06, + "loss": 0.3477, + "step": 1495 + }, + { + "epoch": 3.554267023490931, + "grad_norm": 1.1217299699783325, + "learning_rate": 6.456599286563615e-06, + "loss": 0.3782, + "step": 1496 + }, + { + "epoch": 3.5566458519179305, + "grad_norm": 1.0896966457366943, + "learning_rate": 6.45422116527943e-06, + "loss": 0.3818, + "step": 1497 + }, + { + "epoch": 3.55902468034493, + "grad_norm": 1.00677490234375, + "learning_rate": 6.4518430439952436e-06, + "loss": 0.3635, + "step": 1498 + }, + { + "epoch": 3.56140350877193, + "grad_norm": 0.9133802652359009, + "learning_rate": 6.449464922711058e-06, + "loss": 0.3746, + "step": 1499 + }, + { + "epoch": 3.5637823371989295, + "grad_norm": 0.9326611757278442, + "learning_rate": 6.447086801426873e-06, + "loss": 0.3943, + "step": 1500 + }, + { + "epoch": 3.5637823371989295, + "eval_loss": 0.4378804564476013, + "eval_runtime": 23.1317, + "eval_samples_per_second": 32.337, + "eval_steps_per_second": 16.168, + "step": 1500 + }, + { + "epoch": 3.566161165625929, + "grad_norm": 1.000311255455017, + "learning_rate": 6.444708680142688e-06, + "loss": 0.3883, + "step": 1501 + }, + { + "epoch": 3.568539994052929, + "grad_norm": 1.1306126117706299, + "learning_rate": 6.442330558858503e-06, + "loss": 0.4638, + "step": 1502 + }, + { + "epoch": 3.5709188224799284, + "grad_norm": 1.0773372650146484, + "learning_rate": 6.4399524375743175e-06, + "loss": 0.4246, + "step": 1503 + }, + { + "epoch": 3.5732976509069285, + "grad_norm": 1.0876350402832031, + "learning_rate": 6.437574316290131e-06, + "loss": 0.4652, + "step": 1504 + }, + { + "epoch": 3.575676479333928, + "grad_norm": 1.2721341848373413, + "learning_rate": 6.435196195005946e-06, + "loss": 0.4179, + "step": 1505 + }, + { + "epoch": 3.578055307760928, + "grad_norm": 0.9905206561088562, + "learning_rate": 6.4328180737217605e-06, + "loss": 0.417, + "step": 1506 + }, + { + "epoch": 3.5804341361879275, + "grad_norm": 1.0122687816619873, + "learning_rate": 6.430439952437575e-06, + "loss": 0.4308, + "step": 1507 + }, + { + "epoch": 3.582812964614927, + "grad_norm": 1.0850577354431152, + "learning_rate": 6.42806183115339e-06, + "loss": 0.4209, + "step": 1508 + }, + { + "epoch": 3.585191793041927, + "grad_norm": 0.9188113808631897, + "learning_rate": 6.4256837098692034e-06, + "loss": 0.4312, + "step": 1509 + }, + { + "epoch": 3.5875706214689265, + "grad_norm": 0.9944018125534058, + "learning_rate": 6.423305588585018e-06, + "loss": 0.3312, + "step": 1510 + }, + { + "epoch": 3.589949449895926, + "grad_norm": 0.8936811685562134, + "learning_rate": 6.420927467300833e-06, + "loss": 0.3883, + "step": 1511 + }, + { + "epoch": 3.592328278322926, + "grad_norm": 1.1572505235671997, + "learning_rate": 6.418549346016647e-06, + "loss": 0.4348, + "step": 1512 + }, + { + "epoch": 3.594707106749926, + "grad_norm": 0.9582259654998779, + "learning_rate": 6.416171224732461e-06, + "loss": 0.4161, + "step": 1513 + }, + { + "epoch": 3.597085935176925, + "grad_norm": 1.0612276792526245, + "learning_rate": 6.413793103448276e-06, + "loss": 0.3405, + "step": 1514 + }, + { + "epoch": 3.599464763603925, + "grad_norm": 0.9854185581207275, + "learning_rate": 6.411414982164091e-06, + "loss": 0.3696, + "step": 1515 + }, + { + "epoch": 3.601843592030925, + "grad_norm": 1.1510316133499146, + "learning_rate": 6.409036860879906e-06, + "loss": 0.4194, + "step": 1516 + }, + { + "epoch": 3.6042224204579245, + "grad_norm": 1.0270483493804932, + "learning_rate": 6.40665873959572e-06, + "loss": 0.3419, + "step": 1517 + }, + { + "epoch": 3.606601248884924, + "grad_norm": 0.9114523530006409, + "learning_rate": 6.404280618311535e-06, + "loss": 0.3829, + "step": 1518 + }, + { + "epoch": 3.608980077311924, + "grad_norm": 1.0689023733139038, + "learning_rate": 6.401902497027349e-06, + "loss": 0.4284, + "step": 1519 + }, + { + "epoch": 3.6113589057389235, + "grad_norm": 1.0201904773712158, + "learning_rate": 6.399524375743163e-06, + "loss": 0.3616, + "step": 1520 + }, + { + "epoch": 3.613737734165923, + "grad_norm": 1.0964332818984985, + "learning_rate": 6.397146254458978e-06, + "loss": 0.345, + "step": 1521 + }, + { + "epoch": 3.6161165625929232, + "grad_norm": 1.0271687507629395, + "learning_rate": 6.3947681331747925e-06, + "loss": 0.382, + "step": 1522 + }, + { + "epoch": 3.6184953910199225, + "grad_norm": 0.9846331477165222, + "learning_rate": 6.392390011890607e-06, + "loss": 0.3891, + "step": 1523 + }, + { + "epoch": 3.6208742194469226, + "grad_norm": 1.0800065994262695, + "learning_rate": 6.390011890606421e-06, + "loss": 0.41, + "step": 1524 + }, + { + "epoch": 3.623253047873922, + "grad_norm": 0.979745090007782, + "learning_rate": 6.3876337693222355e-06, + "loss": 0.3307, + "step": 1525 + }, + { + "epoch": 3.625631876300922, + "grad_norm": 1.080016851425171, + "learning_rate": 6.38525564803805e-06, + "loss": 0.4382, + "step": 1526 + }, + { + "epoch": 3.6280107047279215, + "grad_norm": 1.0718671083450317, + "learning_rate": 6.382877526753865e-06, + "loss": 0.4208, + "step": 1527 + }, + { + "epoch": 3.630389533154921, + "grad_norm": 1.025809645652771, + "learning_rate": 6.380499405469679e-06, + "loss": 0.3891, + "step": 1528 + }, + { + "epoch": 3.632768361581921, + "grad_norm": 0.9038216471672058, + "learning_rate": 6.378121284185495e-06, + "loss": 0.3557, + "step": 1529 + }, + { + "epoch": 3.6351471900089205, + "grad_norm": 0.9897075891494751, + "learning_rate": 6.3757431629013086e-06, + "loss": 0.3487, + "step": 1530 + }, + { + "epoch": 3.63752601843592, + "grad_norm": 0.9885250329971313, + "learning_rate": 6.373365041617123e-06, + "loss": 0.3743, + "step": 1531 + }, + { + "epoch": 3.63990484686292, + "grad_norm": 1.0781739950180054, + "learning_rate": 6.370986920332938e-06, + "loss": 0.4903, + "step": 1532 + }, + { + "epoch": 3.64228367528992, + "grad_norm": 1.1592350006103516, + "learning_rate": 6.368608799048752e-06, + "loss": 0.4545, + "step": 1533 + }, + { + "epoch": 3.6446625037169196, + "grad_norm": 1.0216777324676514, + "learning_rate": 6.366230677764567e-06, + "loss": 0.3794, + "step": 1534 + }, + { + "epoch": 3.6470413321439192, + "grad_norm": 1.2327613830566406, + "learning_rate": 6.363852556480381e-06, + "loss": 0.4113, + "step": 1535 + }, + { + "epoch": 3.649420160570919, + "grad_norm": 0.8553999662399292, + "learning_rate": 6.361474435196195e-06, + "loss": 0.3429, + "step": 1536 + }, + { + "epoch": 3.6517989889979185, + "grad_norm": 0.9914376139640808, + "learning_rate": 6.35909631391201e-06, + "loss": 0.4071, + "step": 1537 + }, + { + "epoch": 3.654177817424918, + "grad_norm": 1.0590590238571167, + "learning_rate": 6.356718192627825e-06, + "loss": 0.4807, + "step": 1538 + }, + { + "epoch": 3.656556645851918, + "grad_norm": 1.1553183794021606, + "learning_rate": 6.354340071343638e-06, + "loss": 0.4178, + "step": 1539 + }, + { + "epoch": 3.6589354742789175, + "grad_norm": 0.9712529182434082, + "learning_rate": 6.351961950059453e-06, + "loss": 0.4103, + "step": 1540 + }, + { + "epoch": 3.661314302705917, + "grad_norm": 1.0034068822860718, + "learning_rate": 6.349583828775268e-06, + "loss": 0.4318, + "step": 1541 + }, + { + "epoch": 3.6636931311329173, + "grad_norm": 0.9980916976928711, + "learning_rate": 6.347205707491083e-06, + "loss": 0.3814, + "step": 1542 + }, + { + "epoch": 3.6660719595599165, + "grad_norm": 1.0261144638061523, + "learning_rate": 6.344827586206898e-06, + "loss": 0.4186, + "step": 1543 + }, + { + "epoch": 3.6684507879869166, + "grad_norm": 1.1198776960372925, + "learning_rate": 6.342449464922712e-06, + "loss": 0.4461, + "step": 1544 + }, + { + "epoch": 3.6708296164139163, + "grad_norm": 0.9595498442649841, + "learning_rate": 6.340071343638526e-06, + "loss": 0.3852, + "step": 1545 + }, + { + "epoch": 3.673208444840916, + "grad_norm": 1.0267765522003174, + "learning_rate": 6.337693222354341e-06, + "loss": 0.4015, + "step": 1546 + }, + { + "epoch": 3.6755872732679156, + "grad_norm": 1.077759027481079, + "learning_rate": 6.335315101070155e-06, + "loss": 0.3659, + "step": 1547 + }, + { + "epoch": 3.6779661016949152, + "grad_norm": 1.0041717290878296, + "learning_rate": 6.33293697978597e-06, + "loss": 0.3942, + "step": 1548 + }, + { + "epoch": 3.680344930121915, + "grad_norm": 0.9926592707633972, + "learning_rate": 6.3305588585017845e-06, + "loss": 0.3652, + "step": 1549 + }, + { + "epoch": 3.6827237585489145, + "grad_norm": 1.0736135244369507, + "learning_rate": 6.328180737217598e-06, + "loss": 0.4559, + "step": 1550 + }, + { + "epoch": 3.6827237585489145, + "eval_loss": 0.4346874952316284, + "eval_runtime": 22.9734, + "eval_samples_per_second": 32.559, + "eval_steps_per_second": 16.28, + "step": 1550 + }, + { + "epoch": 3.685102586975914, + "grad_norm": 1.3186063766479492, + "learning_rate": 6.325802615933413e-06, + "loss": 0.3993, + "step": 1551 + }, + { + "epoch": 3.687481415402914, + "grad_norm": 0.9627320766448975, + "learning_rate": 6.3234244946492275e-06, + "loss": 0.3609, + "step": 1552 + }, + { + "epoch": 3.689860243829914, + "grad_norm": 0.9700896739959717, + "learning_rate": 6.321046373365042e-06, + "loss": 0.331, + "step": 1553 + }, + { + "epoch": 3.6922390722569136, + "grad_norm": 1.1891602277755737, + "learning_rate": 6.318668252080856e-06, + "loss": 0.5022, + "step": 1554 + }, + { + "epoch": 3.6946179006839133, + "grad_norm": 0.9691500067710876, + "learning_rate": 6.3162901307966705e-06, + "loss": 0.401, + "step": 1555 + }, + { + "epoch": 3.696996729110913, + "grad_norm": 0.9477513432502747, + "learning_rate": 6.313912009512486e-06, + "loss": 0.3505, + "step": 1556 + }, + { + "epoch": 3.6993755575379126, + "grad_norm": 0.9915082454681396, + "learning_rate": 6.3115338882283005e-06, + "loss": 0.3723, + "step": 1557 + }, + { + "epoch": 3.7017543859649122, + "grad_norm": 0.9867951273918152, + "learning_rate": 6.309155766944115e-06, + "loss": 0.3578, + "step": 1558 + }, + { + "epoch": 3.704133214391912, + "grad_norm": 0.9476925730705261, + "learning_rate": 6.30677764565993e-06, + "loss": 0.366, + "step": 1559 + }, + { + "epoch": 3.7065120428189116, + "grad_norm": 1.0304569005966187, + "learning_rate": 6.3043995243757435e-06, + "loss": 0.3928, + "step": 1560 + }, + { + "epoch": 3.708890871245911, + "grad_norm": 1.1163300275802612, + "learning_rate": 6.302021403091558e-06, + "loss": 0.3978, + "step": 1561 + }, + { + "epoch": 3.7112696996729113, + "grad_norm": 1.0014300346374512, + "learning_rate": 6.299643281807373e-06, + "loss": 0.3901, + "step": 1562 + }, + { + "epoch": 3.7136485280999105, + "grad_norm": 1.053700566291809, + "learning_rate": 6.297265160523187e-06, + "loss": 0.3427, + "step": 1563 + }, + { + "epoch": 3.7160273565269106, + "grad_norm": 1.1162186861038208, + "learning_rate": 6.294887039239002e-06, + "loss": 0.3907, + "step": 1564 + }, + { + "epoch": 3.7184061849539103, + "grad_norm": 1.0361502170562744, + "learning_rate": 6.292508917954816e-06, + "loss": 0.379, + "step": 1565 + }, + { + "epoch": 3.72078501338091, + "grad_norm": 1.1748121976852417, + "learning_rate": 6.29013079667063e-06, + "loss": 0.4177, + "step": 1566 + }, + { + "epoch": 3.7231638418079096, + "grad_norm": 1.1016088724136353, + "learning_rate": 6.287752675386445e-06, + "loss": 0.4104, + "step": 1567 + }, + { + "epoch": 3.7255426702349093, + "grad_norm": 1.0546032190322876, + "learning_rate": 6.2853745541022596e-06, + "loss": 0.384, + "step": 1568 + }, + { + "epoch": 3.727921498661909, + "grad_norm": 1.1000261306762695, + "learning_rate": 6.282996432818073e-06, + "loss": 0.375, + "step": 1569 + }, + { + "epoch": 3.7303003270889086, + "grad_norm": 1.1077104806900024, + "learning_rate": 6.28061831153389e-06, + "loss": 0.4569, + "step": 1570 + }, + { + "epoch": 3.7326791555159087, + "grad_norm": 1.0452616214752197, + "learning_rate": 6.278240190249703e-06, + "loss": 0.341, + "step": 1571 + }, + { + "epoch": 3.735057983942908, + "grad_norm": 0.9090985655784607, + "learning_rate": 6.275862068965518e-06, + "loss": 0.367, + "step": 1572 + }, + { + "epoch": 3.737436812369908, + "grad_norm": 0.9683882594108582, + "learning_rate": 6.273483947681333e-06, + "loss": 0.3597, + "step": 1573 + }, + { + "epoch": 3.7398156407969076, + "grad_norm": 0.9500783681869507, + "learning_rate": 6.271105826397147e-06, + "loss": 0.3343, + "step": 1574 + }, + { + "epoch": 3.7421944692239073, + "grad_norm": 0.9047067165374756, + "learning_rate": 6.268727705112961e-06, + "loss": 0.3407, + "step": 1575 + }, + { + "epoch": 3.744573297650907, + "grad_norm": 1.0705029964447021, + "learning_rate": 6.266349583828776e-06, + "loss": 0.4084, + "step": 1576 + }, + { + "epoch": 3.7469521260779066, + "grad_norm": 1.1653494834899902, + "learning_rate": 6.26397146254459e-06, + "loss": 0.4362, + "step": 1577 + }, + { + "epoch": 3.7493309545049063, + "grad_norm": 1.1642428636550903, + "learning_rate": 6.261593341260405e-06, + "loss": 0.435, + "step": 1578 + }, + { + "epoch": 3.751709782931906, + "grad_norm": 1.0606120824813843, + "learning_rate": 6.259215219976219e-06, + "loss": 0.4154, + "step": 1579 + }, + { + "epoch": 3.7540886113589056, + "grad_norm": 1.2212231159210205, + "learning_rate": 6.256837098692033e-06, + "loss": 0.4124, + "step": 1580 + }, + { + "epoch": 3.7564674397859052, + "grad_norm": 0.9877543449401855, + "learning_rate": 6.254458977407848e-06, + "loss": 0.3301, + "step": 1581 + }, + { + "epoch": 3.7588462682129054, + "grad_norm": 1.2293020486831665, + "learning_rate": 6.252080856123662e-06, + "loss": 0.4683, + "step": 1582 + }, + { + "epoch": 3.761225096639905, + "grad_norm": 1.0100992918014526, + "learning_rate": 6.249702734839478e-06, + "loss": 0.3821, + "step": 1583 + }, + { + "epoch": 3.7636039250669047, + "grad_norm": 1.0480844974517822, + "learning_rate": 6.2473246135552925e-06, + "loss": 0.4057, + "step": 1584 + }, + { + "epoch": 3.7659827534939043, + "grad_norm": 1.0032594203948975, + "learning_rate": 6.244946492271107e-06, + "loss": 0.3512, + "step": 1585 + }, + { + "epoch": 3.768361581920904, + "grad_norm": 0.8761650323867798, + "learning_rate": 6.242568370986921e-06, + "loss": 0.3358, + "step": 1586 + }, + { + "epoch": 3.7707404103479036, + "grad_norm": 1.0161612033843994, + "learning_rate": 6.2401902497027355e-06, + "loss": 0.3917, + "step": 1587 + }, + { + "epoch": 3.7731192387749033, + "grad_norm": 0.9477819800376892, + "learning_rate": 6.23781212841855e-06, + "loss": 0.387, + "step": 1588 + }, + { + "epoch": 3.775498067201903, + "grad_norm": 1.0154447555541992, + "learning_rate": 6.235434007134365e-06, + "loss": 0.4235, + "step": 1589 + }, + { + "epoch": 3.7778768956289026, + "grad_norm": 0.9523534178733826, + "learning_rate": 6.2330558858501784e-06, + "loss": 0.4017, + "step": 1590 + }, + { + "epoch": 3.7802557240559027, + "grad_norm": 1.0012913942337036, + "learning_rate": 6.230677764565993e-06, + "loss": 0.3937, + "step": 1591 + }, + { + "epoch": 3.782634552482902, + "grad_norm": 1.0440473556518555, + "learning_rate": 6.228299643281808e-06, + "loss": 0.4509, + "step": 1592 + }, + { + "epoch": 3.785013380909902, + "grad_norm": 0.9913232922554016, + "learning_rate": 6.225921521997622e-06, + "loss": 0.3904, + "step": 1593 + }, + { + "epoch": 3.7873922093369017, + "grad_norm": 1.1031168699264526, + "learning_rate": 6.223543400713437e-06, + "loss": 0.4092, + "step": 1594 + }, + { + "epoch": 3.7897710377639013, + "grad_norm": 0.9650037288665771, + "learning_rate": 6.221165279429251e-06, + "loss": 0.4359, + "step": 1595 + }, + { + "epoch": 3.792149866190901, + "grad_norm": 1.0663871765136719, + "learning_rate": 6.218787158145065e-06, + "loss": 0.4738, + "step": 1596 + }, + { + "epoch": 3.7945286946179007, + "grad_norm": 1.2090823650360107, + "learning_rate": 6.216409036860881e-06, + "loss": 0.4228, + "step": 1597 + }, + { + "epoch": 3.7969075230449003, + "grad_norm": 1.0195385217666626, + "learning_rate": 6.214030915576695e-06, + "loss": 0.4388, + "step": 1598 + }, + { + "epoch": 3.7992863514719, + "grad_norm": 1.027744174003601, + "learning_rate": 6.21165279429251e-06, + "loss": 0.4218, + "step": 1599 + }, + { + "epoch": 3.8016651798988996, + "grad_norm": 0.9905930161476135, + "learning_rate": 6.2092746730083246e-06, + "loss": 0.3693, + "step": 1600 + }, + { + "epoch": 3.8016651798988996, + "eval_loss": 0.43306928873062134, + "eval_runtime": 23.1968, + "eval_samples_per_second": 32.246, + "eval_steps_per_second": 16.123, + "step": 1600 + }, + { + "epoch": 3.8040440083258993, + "grad_norm": 1.1530020236968994, + "learning_rate": 6.206896551724138e-06, + "loss": 0.368, + "step": 1601 + }, + { + "epoch": 3.8064228367528994, + "grad_norm": 1.0000945329666138, + "learning_rate": 6.204518430439953e-06, + "loss": 0.3953, + "step": 1602 + }, + { + "epoch": 3.808801665179899, + "grad_norm": 1.0503944158554077, + "learning_rate": 6.2021403091557675e-06, + "loss": 0.3977, + "step": 1603 + }, + { + "epoch": 3.8111804936068987, + "grad_norm": 1.0389865636825562, + "learning_rate": 6.199762187871582e-06, + "loss": 0.3722, + "step": 1604 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 1.1230560541152954, + "learning_rate": 6.197384066587396e-06, + "loss": 0.4122, + "step": 1605 + }, + { + "epoch": 3.815938150460898, + "grad_norm": 1.20895254611969, + "learning_rate": 6.1950059453032105e-06, + "loss": 0.4515, + "step": 1606 + }, + { + "epoch": 3.8183169788878977, + "grad_norm": 1.0878880023956299, + "learning_rate": 6.192627824019025e-06, + "loss": 0.4212, + "step": 1607 + }, + { + "epoch": 3.8206958073148973, + "grad_norm": 1.0784084796905518, + "learning_rate": 6.19024970273484e-06, + "loss": 0.4765, + "step": 1608 + }, + { + "epoch": 3.823074635741897, + "grad_norm": 0.9995868802070618, + "learning_rate": 6.187871581450654e-06, + "loss": 0.3368, + "step": 1609 + }, + { + "epoch": 3.8254534641688966, + "grad_norm": 1.0124704837799072, + "learning_rate": 6.185493460166468e-06, + "loss": 0.3595, + "step": 1610 + }, + { + "epoch": 3.8278322925958967, + "grad_norm": 1.2775441408157349, + "learning_rate": 6.183115338882284e-06, + "loss": 0.442, + "step": 1611 + }, + { + "epoch": 3.830211121022896, + "grad_norm": 1.1760762929916382, + "learning_rate": 6.180737217598098e-06, + "loss": 0.3998, + "step": 1612 + }, + { + "epoch": 3.832589949449896, + "grad_norm": 1.156217336654663, + "learning_rate": 6.178359096313913e-06, + "loss": 0.4735, + "step": 1613 + }, + { + "epoch": 3.8349687778768957, + "grad_norm": 0.9696272015571594, + "learning_rate": 6.175980975029727e-06, + "loss": 0.4524, + "step": 1614 + }, + { + "epoch": 3.8373476063038954, + "grad_norm": 1.126508116722107, + "learning_rate": 6.173602853745542e-06, + "loss": 0.3754, + "step": 1615 + }, + { + "epoch": 3.839726434730895, + "grad_norm": 1.2106831073760986, + "learning_rate": 6.171224732461356e-06, + "loss": 0.3816, + "step": 1616 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 1.0491403341293335, + "learning_rate": 6.16884661117717e-06, + "loss": 0.3805, + "step": 1617 + }, + { + "epoch": 3.8444840915848943, + "grad_norm": 1.1031827926635742, + "learning_rate": 6.166468489892985e-06, + "loss": 0.4422, + "step": 1618 + }, + { + "epoch": 3.846862920011894, + "grad_norm": 0.9999046325683594, + "learning_rate": 6.1640903686088e-06, + "loss": 0.3497, + "step": 1619 + }, + { + "epoch": 3.849241748438894, + "grad_norm": 1.0008429288864136, + "learning_rate": 6.161712247324614e-06, + "loss": 0.445, + "step": 1620 + }, + { + "epoch": 3.8516205768658933, + "grad_norm": 0.8759855031967163, + "learning_rate": 6.159334126040428e-06, + "loss": 0.3919, + "step": 1621 + }, + { + "epoch": 3.8539994052928934, + "grad_norm": 1.0664228200912476, + "learning_rate": 6.156956004756243e-06, + "loss": 0.398, + "step": 1622 + }, + { + "epoch": 3.856378233719893, + "grad_norm": 1.0862411260604858, + "learning_rate": 6.154577883472057e-06, + "loss": 0.3911, + "step": 1623 + }, + { + "epoch": 3.8587570621468927, + "grad_norm": 1.0657200813293457, + "learning_rate": 6.152199762187873e-06, + "loss": 0.3708, + "step": 1624 + }, + { + "epoch": 3.8611358905738924, + "grad_norm": 1.2011994123458862, + "learning_rate": 6.149821640903687e-06, + "loss": 0.3625, + "step": 1625 + }, + { + "epoch": 3.863514719000892, + "grad_norm": 1.1315374374389648, + "learning_rate": 6.147443519619501e-06, + "loss": 0.4363, + "step": 1626 + }, + { + "epoch": 3.8658935474278917, + "grad_norm": 1.2873175144195557, + "learning_rate": 6.145065398335316e-06, + "loss": 0.4273, + "step": 1627 + }, + { + "epoch": 3.8682723758548914, + "grad_norm": 0.9840818643569946, + "learning_rate": 6.14268727705113e-06, + "loss": 0.3817, + "step": 1628 + }, + { + "epoch": 3.870651204281891, + "grad_norm": 1.1053478717803955, + "learning_rate": 6.140309155766945e-06, + "loss": 0.3955, + "step": 1629 + }, + { + "epoch": 3.8730300327088907, + "grad_norm": 1.1687885522842407, + "learning_rate": 6.1379310344827595e-06, + "loss": 0.3958, + "step": 1630 + }, + { + "epoch": 3.875408861135891, + "grad_norm": 1.1339951753616333, + "learning_rate": 6.135552913198573e-06, + "loss": 0.4258, + "step": 1631 + }, + { + "epoch": 3.8777876895628904, + "grad_norm": 0.9762992858886719, + "learning_rate": 6.133174791914388e-06, + "loss": 0.4037, + "step": 1632 + }, + { + "epoch": 3.88016651798989, + "grad_norm": 0.840178906917572, + "learning_rate": 6.1307966706302025e-06, + "loss": 0.3486, + "step": 1633 + }, + { + "epoch": 3.8825453464168898, + "grad_norm": 0.9195915460586548, + "learning_rate": 6.128418549346017e-06, + "loss": 0.35, + "step": 1634 + }, + { + "epoch": 3.8849241748438894, + "grad_norm": 1.0214698314666748, + "learning_rate": 6.126040428061832e-06, + "loss": 0.4089, + "step": 1635 + }, + { + "epoch": 3.887303003270889, + "grad_norm": 0.9909235239028931, + "learning_rate": 6.1236623067776455e-06, + "loss": 0.3957, + "step": 1636 + }, + { + "epoch": 3.8896818316978887, + "grad_norm": 1.0571664571762085, + "learning_rate": 6.12128418549346e-06, + "loss": 0.4197, + "step": 1637 + }, + { + "epoch": 3.8920606601248884, + "grad_norm": 1.007819414138794, + "learning_rate": 6.1189060642092755e-06, + "loss": 0.4292, + "step": 1638 + }, + { + "epoch": 3.894439488551888, + "grad_norm": 1.3624173402786255, + "learning_rate": 6.11652794292509e-06, + "loss": 0.4429, + "step": 1639 + }, + { + "epoch": 3.896818316978888, + "grad_norm": 1.0355406999588013, + "learning_rate": 6.114149821640905e-06, + "loss": 0.3862, + "step": 1640 + }, + { + "epoch": 3.8991971454058874, + "grad_norm": 0.8824178576469421, + "learning_rate": 6.111771700356719e-06, + "loss": 0.3329, + "step": 1641 + }, + { + "epoch": 3.9015759738328875, + "grad_norm": 1.0744909048080444, + "learning_rate": 6.109393579072533e-06, + "loss": 0.3848, + "step": 1642 + }, + { + "epoch": 3.903954802259887, + "grad_norm": 1.1089926958084106, + "learning_rate": 6.107015457788348e-06, + "loss": 0.3851, + "step": 1643 + }, + { + "epoch": 3.9063336306868868, + "grad_norm": 0.9915771484375, + "learning_rate": 6.104637336504162e-06, + "loss": 0.3335, + "step": 1644 + }, + { + "epoch": 3.9087124591138864, + "grad_norm": 1.0598069429397583, + "learning_rate": 6.102259215219977e-06, + "loss": 0.4458, + "step": 1645 + }, + { + "epoch": 3.911091287540886, + "grad_norm": 1.19451904296875, + "learning_rate": 6.099881093935791e-06, + "loss": 0.3767, + "step": 1646 + }, + { + "epoch": 3.9134701159678857, + "grad_norm": 1.2057572603225708, + "learning_rate": 6.097502972651605e-06, + "loss": 0.3306, + "step": 1647 + }, + { + "epoch": 3.9158489443948854, + "grad_norm": 1.018324375152588, + "learning_rate": 6.09512485136742e-06, + "loss": 0.3969, + "step": 1648 + }, + { + "epoch": 3.918227772821885, + "grad_norm": 0.9551764726638794, + "learning_rate": 6.0927467300832346e-06, + "loss": 0.3564, + "step": 1649 + }, + { + "epoch": 3.9206066012488847, + "grad_norm": 1.2196859121322632, + "learning_rate": 6.090368608799049e-06, + "loss": 0.4171, + "step": 1650 + }, + { + "epoch": 3.9206066012488847, + "eval_loss": 0.43215590715408325, + "eval_runtime": 23.1603, + "eval_samples_per_second": 32.297, + "eval_steps_per_second": 16.148, + "step": 1650 + }, + { + "epoch": 3.922985429675885, + "grad_norm": 0.9969528317451477, + "learning_rate": 6.087990487514863e-06, + "loss": 0.3759, + "step": 1651 + }, + { + "epoch": 3.9253642581028845, + "grad_norm": 1.336701512336731, + "learning_rate": 6.085612366230678e-06, + "loss": 0.4623, + "step": 1652 + }, + { + "epoch": 3.927743086529884, + "grad_norm": 1.0124858617782593, + "learning_rate": 6.083234244946493e-06, + "loss": 0.4201, + "step": 1653 + }, + { + "epoch": 3.930121914956884, + "grad_norm": 1.1042088270187378, + "learning_rate": 6.080856123662308e-06, + "loss": 0.3461, + "step": 1654 + }, + { + "epoch": 3.9325007433838834, + "grad_norm": 1.1226118803024292, + "learning_rate": 6.078478002378122e-06, + "loss": 0.4981, + "step": 1655 + }, + { + "epoch": 3.934879571810883, + "grad_norm": 1.3208426237106323, + "learning_rate": 6.076099881093937e-06, + "loss": 0.4006, + "step": 1656 + }, + { + "epoch": 3.9372584002378828, + "grad_norm": 0.9814326763153076, + "learning_rate": 6.073721759809751e-06, + "loss": 0.4038, + "step": 1657 + }, + { + "epoch": 3.9396372286648824, + "grad_norm": 1.16126549243927, + "learning_rate": 6.071343638525565e-06, + "loss": 0.382, + "step": 1658 + }, + { + "epoch": 3.942016057091882, + "grad_norm": 1.1452785730361938, + "learning_rate": 6.06896551724138e-06, + "loss": 0.3593, + "step": 1659 + }, + { + "epoch": 3.944394885518882, + "grad_norm": 0.8712359666824341, + "learning_rate": 6.0665873959571944e-06, + "loss": 0.3527, + "step": 1660 + }, + { + "epoch": 3.9467737139458814, + "grad_norm": 1.1528770923614502, + "learning_rate": 6.064209274673008e-06, + "loss": 0.402, + "step": 1661 + }, + { + "epoch": 3.9491525423728815, + "grad_norm": 1.0356228351593018, + "learning_rate": 6.061831153388823e-06, + "loss": 0.4075, + "step": 1662 + }, + { + "epoch": 3.951531370799881, + "grad_norm": 0.9766412377357483, + "learning_rate": 6.059453032104637e-06, + "loss": 0.3743, + "step": 1663 + }, + { + "epoch": 3.953910199226881, + "grad_norm": 1.0192891359329224, + "learning_rate": 6.057074910820452e-06, + "loss": 0.3923, + "step": 1664 + }, + { + "epoch": 3.9562890276538805, + "grad_norm": 1.078993797302246, + "learning_rate": 6.0546967895362675e-06, + "loss": 0.3838, + "step": 1665 + }, + { + "epoch": 3.95866785608088, + "grad_norm": 0.991115927696228, + "learning_rate": 6.052318668252082e-06, + "loss": 0.333, + "step": 1666 + }, + { + "epoch": 3.96104668450788, + "grad_norm": 1.2205140590667725, + "learning_rate": 6.049940546967896e-06, + "loss": 0.3738, + "step": 1667 + }, + { + "epoch": 3.9634255129348794, + "grad_norm": 1.0416064262390137, + "learning_rate": 6.0475624256837105e-06, + "loss": 0.3563, + "step": 1668 + }, + { + "epoch": 3.9658043413618795, + "grad_norm": 1.1698068380355835, + "learning_rate": 6.045184304399525e-06, + "loss": 0.3951, + "step": 1669 + }, + { + "epoch": 3.9681831697888788, + "grad_norm": 1.247964859008789, + "learning_rate": 6.04280618311534e-06, + "loss": 0.425, + "step": 1670 + }, + { + "epoch": 3.970561998215879, + "grad_norm": 1.0218197107315063, + "learning_rate": 6.040428061831154e-06, + "loss": 0.3661, + "step": 1671 + }, + { + "epoch": 3.9729408266428785, + "grad_norm": 1.0488650798797607, + "learning_rate": 6.038049940546968e-06, + "loss": 0.3905, + "step": 1672 + }, + { + "epoch": 3.975319655069878, + "grad_norm": 1.0618765354156494, + "learning_rate": 6.035671819262783e-06, + "loss": 0.3952, + "step": 1673 + }, + { + "epoch": 3.977698483496878, + "grad_norm": 1.110557198524475, + "learning_rate": 6.033293697978597e-06, + "loss": 0.5601, + "step": 1674 + }, + { + "epoch": 3.9800773119238775, + "grad_norm": 1.1355289220809937, + "learning_rate": 6.030915576694412e-06, + "loss": 0.3883, + "step": 1675 + }, + { + "epoch": 3.982456140350877, + "grad_norm": 0.8546428680419922, + "learning_rate": 6.028537455410226e-06, + "loss": 0.3598, + "step": 1676 + }, + { + "epoch": 3.984834968777877, + "grad_norm": 1.016870141029358, + "learning_rate": 6.02615933412604e-06, + "loss": 0.3797, + "step": 1677 + }, + { + "epoch": 3.9872137972048765, + "grad_norm": 1.1178412437438965, + "learning_rate": 6.023781212841855e-06, + "loss": 0.4609, + "step": 1678 + }, + { + "epoch": 3.989592625631876, + "grad_norm": 0.9157519340515137, + "learning_rate": 6.02140309155767e-06, + "loss": 0.3796, + "step": 1679 + }, + { + "epoch": 3.991971454058876, + "grad_norm": 0.9614365100860596, + "learning_rate": 6.019024970273485e-06, + "loss": 0.4171, + "step": 1680 + }, + { + "epoch": 3.994350282485876, + "grad_norm": 1.103972315788269, + "learning_rate": 6.0166468489892996e-06, + "loss": 0.3171, + "step": 1681 + }, + { + "epoch": 3.9967291109128755, + "grad_norm": 1.131468415260315, + "learning_rate": 6.014268727705113e-06, + "loss": 0.3356, + "step": 1682 + }, + { + "epoch": 3.999107939339875, + "grad_norm": 1.1129132509231567, + "learning_rate": 6.011890606420928e-06, + "loss": 0.3833, + "step": 1683 + }, + { + "epoch": 4.0, + "grad_norm": 2.2083308696746826, + "learning_rate": 6.0095124851367426e-06, + "loss": 0.3752, + "step": 1684 + }, + { + "epoch": 4.002378828427, + "grad_norm": 0.973565936088562, + "learning_rate": 6.007134363852557e-06, + "loss": 0.3709, + "step": 1685 + }, + { + "epoch": 4.004757656853999, + "grad_norm": 0.9815882444381714, + "learning_rate": 6.004756242568372e-06, + "loss": 0.3284, + "step": 1686 + }, + { + "epoch": 4.007136485280999, + "grad_norm": 0.9500920176506042, + "learning_rate": 6.0023781212841855e-06, + "loss": 0.3965, + "step": 1687 + }, + { + "epoch": 4.009515313707999, + "grad_norm": 0.9478591680526733, + "learning_rate": 6e-06, + "loss": 0.3425, + "step": 1688 + }, + { + "epoch": 4.011894142134999, + "grad_norm": 0.996497392654419, + "learning_rate": 5.997621878715815e-06, + "loss": 0.4157, + "step": 1689 + }, + { + "epoch": 4.014272970561998, + "grad_norm": 1.0207120180130005, + "learning_rate": 5.995243757431629e-06, + "loss": 0.3734, + "step": 1690 + }, + { + "epoch": 4.016651798988998, + "grad_norm": 0.9969112277030945, + "learning_rate": 5.992865636147443e-06, + "loss": 0.3227, + "step": 1691 + }, + { + "epoch": 4.019030627415997, + "grad_norm": 1.0664403438568115, + "learning_rate": 5.990487514863258e-06, + "loss": 0.4472, + "step": 1692 + }, + { + "epoch": 4.021409455842997, + "grad_norm": 1.197433352470398, + "learning_rate": 5.988109393579073e-06, + "loss": 0.4159, + "step": 1693 + }, + { + "epoch": 4.0237882842699975, + "grad_norm": 1.0851536989212036, + "learning_rate": 5.985731272294888e-06, + "loss": 0.4023, + "step": 1694 + }, + { + "epoch": 4.026167112696997, + "grad_norm": 1.1099220514297485, + "learning_rate": 5.9833531510107024e-06, + "loss": 0.4014, + "step": 1695 + }, + { + "epoch": 4.028545941123997, + "grad_norm": 0.8995441198348999, + "learning_rate": 5.980975029726517e-06, + "loss": 0.3264, + "step": 1696 + }, + { + "epoch": 4.030924769550996, + "grad_norm": 1.0133134126663208, + "learning_rate": 5.978596908442331e-06, + "loss": 0.3689, + "step": 1697 + }, + { + "epoch": 4.033303597977996, + "grad_norm": 1.0279959440231323, + "learning_rate": 5.976218787158145e-06, + "loss": 0.3603, + "step": 1698 + }, + { + "epoch": 4.035682426404995, + "grad_norm": 1.0391390323638916, + "learning_rate": 5.97384066587396e-06, + "loss": 0.3754, + "step": 1699 + }, + { + "epoch": 4.038061254831995, + "grad_norm": 0.966560423374176, + "learning_rate": 5.971462544589775e-06, + "loss": 0.4055, + "step": 1700 + }, + { + "epoch": 4.038061254831995, + "eval_loss": 0.4315594434738159, + "eval_runtime": 23.3724, + "eval_samples_per_second": 32.004, + "eval_steps_per_second": 16.002, + "step": 1700 + }, + { + "epoch": 4.040440083258995, + "grad_norm": 1.249593734741211, + "learning_rate": 5.969084423305589e-06, + "loss": 0.4272, + "step": 1701 + }, + { + "epoch": 4.042818911685995, + "grad_norm": 0.9564142823219299, + "learning_rate": 5.966706302021403e-06, + "loss": 0.3797, + "step": 1702 + }, + { + "epoch": 4.045197740112994, + "grad_norm": 1.0791631937026978, + "learning_rate": 5.964328180737218e-06, + "loss": 0.355, + "step": 1703 + }, + { + "epoch": 4.047576568539994, + "grad_norm": 1.0557457208633423, + "learning_rate": 5.961950059453032e-06, + "loss": 0.3951, + "step": 1704 + }, + { + "epoch": 4.049955396966994, + "grad_norm": 1.1637648344039917, + "learning_rate": 5.959571938168847e-06, + "loss": 0.4252, + "step": 1705 + }, + { + "epoch": 4.052334225393993, + "grad_norm": 1.1711636781692505, + "learning_rate": 5.957193816884662e-06, + "loss": 0.3462, + "step": 1706 + }, + { + "epoch": 4.0547130538209935, + "grad_norm": 1.0236165523529053, + "learning_rate": 5.954815695600477e-06, + "loss": 0.3213, + "step": 1707 + }, + { + "epoch": 4.057091882247993, + "grad_norm": 1.0058701038360596, + "learning_rate": 5.952437574316291e-06, + "loss": 0.34, + "step": 1708 + }, + { + "epoch": 4.059470710674993, + "grad_norm": 1.1158058643341064, + "learning_rate": 5.950059453032105e-06, + "loss": 0.3966, + "step": 1709 + }, + { + "epoch": 4.061849539101992, + "grad_norm": 0.9432827234268188, + "learning_rate": 5.94768133174792e-06, + "loss": 0.319, + "step": 1710 + }, + { + "epoch": 4.064228367528992, + "grad_norm": 1.0525660514831543, + "learning_rate": 5.9453032104637345e-06, + "loss": 0.3793, + "step": 1711 + }, + { + "epoch": 4.066607195955991, + "grad_norm": 1.133090853691101, + "learning_rate": 5.942925089179548e-06, + "loss": 0.3993, + "step": 1712 + }, + { + "epoch": 4.068986024382991, + "grad_norm": 1.136536717414856, + "learning_rate": 5.940546967895363e-06, + "loss": 0.3062, + "step": 1713 + }, + { + "epoch": 4.0713648528099915, + "grad_norm": 1.091807246208191, + "learning_rate": 5.9381688466111775e-06, + "loss": 0.3259, + "step": 1714 + }, + { + "epoch": 4.073743681236991, + "grad_norm": 0.9638052582740784, + "learning_rate": 5.935790725326992e-06, + "loss": 0.3462, + "step": 1715 + }, + { + "epoch": 4.076122509663991, + "grad_norm": 1.1180860996246338, + "learning_rate": 5.933412604042807e-06, + "loss": 0.3552, + "step": 1716 + }, + { + "epoch": 4.07850133809099, + "grad_norm": 1.2410954236984253, + "learning_rate": 5.9310344827586205e-06, + "loss": 0.414, + "step": 1717 + }, + { + "epoch": 4.08088016651799, + "grad_norm": 1.2453638315200806, + "learning_rate": 5.928656361474435e-06, + "loss": 0.356, + "step": 1718 + }, + { + "epoch": 4.083258994944989, + "grad_norm": 1.0123298168182373, + "learning_rate": 5.92627824019025e-06, + "loss": 0.3224, + "step": 1719 + }, + { + "epoch": 4.085637823371989, + "grad_norm": 1.1414096355438232, + "learning_rate": 5.923900118906065e-06, + "loss": 0.3766, + "step": 1720 + }, + { + "epoch": 4.088016651798989, + "grad_norm": 1.0205833911895752, + "learning_rate": 5.92152199762188e-06, + "loss": 0.4007, + "step": 1721 + }, + { + "epoch": 4.090395480225989, + "grad_norm": 0.9781925082206726, + "learning_rate": 5.919143876337694e-06, + "loss": 0.3732, + "step": 1722 + }, + { + "epoch": 4.092774308652988, + "grad_norm": 1.037003755569458, + "learning_rate": 5.916765755053508e-06, + "loss": 0.3535, + "step": 1723 + }, + { + "epoch": 4.095153137079988, + "grad_norm": 1.1102654933929443, + "learning_rate": 5.914387633769323e-06, + "loss": 0.4401, + "step": 1724 + }, + { + "epoch": 4.097531965506988, + "grad_norm": 1.2157264947891235, + "learning_rate": 5.912009512485137e-06, + "loss": 0.392, + "step": 1725 + }, + { + "epoch": 4.099910793933987, + "grad_norm": 1.0958483219146729, + "learning_rate": 5.909631391200952e-06, + "loss": 0.3078, + "step": 1726 + }, + { + "epoch": 4.1022896223609875, + "grad_norm": 1.2432295083999634, + "learning_rate": 5.907253269916767e-06, + "loss": 0.3948, + "step": 1727 + }, + { + "epoch": 4.104668450787987, + "grad_norm": 1.1019243001937866, + "learning_rate": 5.90487514863258e-06, + "loss": 0.4006, + "step": 1728 + }, + { + "epoch": 4.107047279214987, + "grad_norm": 1.2110435962677002, + "learning_rate": 5.902497027348395e-06, + "loss": 0.3742, + "step": 1729 + }, + { + "epoch": 4.109426107641986, + "grad_norm": 1.0437052249908447, + "learning_rate": 5.9001189060642096e-06, + "loss": 0.3806, + "step": 1730 + }, + { + "epoch": 4.111804936068986, + "grad_norm": 1.1280004978179932, + "learning_rate": 5.897740784780024e-06, + "loss": 0.3607, + "step": 1731 + }, + { + "epoch": 4.114183764495985, + "grad_norm": 1.1035478115081787, + "learning_rate": 5.895362663495838e-06, + "loss": 0.3351, + "step": 1732 + }, + { + "epoch": 4.116562592922985, + "grad_norm": 1.1649484634399414, + "learning_rate": 5.8929845422116526e-06, + "loss": 0.3719, + "step": 1733 + }, + { + "epoch": 4.1189414213499855, + "grad_norm": 0.9625906944274902, + "learning_rate": 5.890606420927468e-06, + "loss": 0.3836, + "step": 1734 + }, + { + "epoch": 4.121320249776985, + "grad_norm": 1.1445996761322021, + "learning_rate": 5.888228299643283e-06, + "loss": 0.4193, + "step": 1735 + }, + { + "epoch": 4.123699078203985, + "grad_norm": 1.14402437210083, + "learning_rate": 5.885850178359097e-06, + "loss": 0.41, + "step": 1736 + }, + { + "epoch": 4.126077906630984, + "grad_norm": 1.066888689994812, + "learning_rate": 5.883472057074912e-06, + "loss": 0.4043, + "step": 1737 + }, + { + "epoch": 4.128456735057984, + "grad_norm": 0.9996502995491028, + "learning_rate": 5.881093935790726e-06, + "loss": 0.4061, + "step": 1738 + }, + { + "epoch": 4.130835563484983, + "grad_norm": 1.0126618146896362, + "learning_rate": 5.87871581450654e-06, + "loss": 0.4217, + "step": 1739 + }, + { + "epoch": 4.1332143919119835, + "grad_norm": 1.1604323387145996, + "learning_rate": 5.876337693222355e-06, + "loss": 0.387, + "step": 1740 + }, + { + "epoch": 4.135593220338983, + "grad_norm": 1.1115528345108032, + "learning_rate": 5.8739595719381694e-06, + "loss": 0.3267, + "step": 1741 + }, + { + "epoch": 4.137972048765983, + "grad_norm": 1.0919880867004395, + "learning_rate": 5.871581450653984e-06, + "loss": 0.365, + "step": 1742 + }, + { + "epoch": 4.140350877192983, + "grad_norm": 0.9594374299049377, + "learning_rate": 5.869203329369798e-06, + "loss": 0.3373, + "step": 1743 + }, + { + "epoch": 4.142729705619982, + "grad_norm": 1.2847731113433838, + "learning_rate": 5.8668252080856124e-06, + "loss": 0.4684, + "step": 1744 + }, + { + "epoch": 4.145108534046982, + "grad_norm": 0.9923352599143982, + "learning_rate": 5.864447086801427e-06, + "loss": 0.3363, + "step": 1745 + }, + { + "epoch": 4.147487362473981, + "grad_norm": 0.9537487030029297, + "learning_rate": 5.862068965517242e-06, + "loss": 0.3035, + "step": 1746 + }, + { + "epoch": 4.1498661909009815, + "grad_norm": 1.134384274482727, + "learning_rate": 5.859690844233057e-06, + "loss": 0.3317, + "step": 1747 + }, + { + "epoch": 4.152245019327981, + "grad_norm": 0.9351004362106323, + "learning_rate": 5.857312722948872e-06, + "loss": 0.3615, + "step": 1748 + }, + { + "epoch": 4.154623847754981, + "grad_norm": 1.1589502096176147, + "learning_rate": 5.8549346016646855e-06, + "loss": 0.4273, + "step": 1749 + }, + { + "epoch": 4.15700267618198, + "grad_norm": 1.0163840055465698, + "learning_rate": 5.8525564803805e-06, + "loss": 0.3228, + "step": 1750 + }, + { + "epoch": 4.15700267618198, + "eval_loss": 0.43061861395835876, + "eval_runtime": 23.3779, + "eval_samples_per_second": 31.996, + "eval_steps_per_second": 15.998, + "step": 1750 + }, + { + "epoch": 4.15938150460898, + "grad_norm": 1.1810320615768433, + "learning_rate": 5.850178359096315e-06, + "loss": 0.351, + "step": 1751 + }, + { + "epoch": 4.161760333035979, + "grad_norm": 1.185821771621704, + "learning_rate": 5.847800237812129e-06, + "loss": 0.4408, + "step": 1752 + }, + { + "epoch": 4.1641391614629795, + "grad_norm": 1.169162392616272, + "learning_rate": 5.845422116527943e-06, + "loss": 0.3001, + "step": 1753 + }, + { + "epoch": 4.16651798988998, + "grad_norm": 0.9837247729301453, + "learning_rate": 5.843043995243758e-06, + "loss": 0.3521, + "step": 1754 + }, + { + "epoch": 4.168896818316979, + "grad_norm": 1.0876203775405884, + "learning_rate": 5.840665873959572e-06, + "loss": 0.3343, + "step": 1755 + }, + { + "epoch": 4.171275646743979, + "grad_norm": 1.0030754804611206, + "learning_rate": 5.838287752675387e-06, + "loss": 0.3099, + "step": 1756 + }, + { + "epoch": 4.173654475170978, + "grad_norm": 1.3043830394744873, + "learning_rate": 5.8359096313912015e-06, + "loss": 0.3482, + "step": 1757 + }, + { + "epoch": 4.176033303597978, + "grad_norm": 1.0944795608520508, + "learning_rate": 5.833531510107015e-06, + "loss": 0.4264, + "step": 1758 + }, + { + "epoch": 4.178412132024977, + "grad_norm": 1.0918995141983032, + "learning_rate": 5.83115338882283e-06, + "loss": 0.3318, + "step": 1759 + }, + { + "epoch": 4.1807909604519775, + "grad_norm": 1.0467015504837036, + "learning_rate": 5.8287752675386445e-06, + "loss": 0.3685, + "step": 1760 + }, + { + "epoch": 4.183169788878977, + "grad_norm": 1.2380188703536987, + "learning_rate": 5.82639714625446e-06, + "loss": 0.4044, + "step": 1761 + }, + { + "epoch": 4.185548617305977, + "grad_norm": 1.2682744264602661, + "learning_rate": 5.824019024970275e-06, + "loss": 0.4008, + "step": 1762 + }, + { + "epoch": 4.187927445732977, + "grad_norm": 1.3744791746139526, + "learning_rate": 5.821640903686089e-06, + "loss": 0.4445, + "step": 1763 + }, + { + "epoch": 4.190306274159976, + "grad_norm": 0.9570406079292297, + "learning_rate": 5.819262782401903e-06, + "loss": 0.3268, + "step": 1764 + }, + { + "epoch": 4.192685102586976, + "grad_norm": 1.1156271696090698, + "learning_rate": 5.8168846611177176e-06, + "loss": 0.4249, + "step": 1765 + }, + { + "epoch": 4.1950639310139755, + "grad_norm": 1.0250122547149658, + "learning_rate": 5.814506539833532e-06, + "loss": 0.3643, + "step": 1766 + }, + { + "epoch": 4.197442759440976, + "grad_norm": 1.0883069038391113, + "learning_rate": 5.812128418549347e-06, + "loss": 0.3632, + "step": 1767 + }, + { + "epoch": 4.199821587867975, + "grad_norm": 1.2012964487075806, + "learning_rate": 5.8097502972651606e-06, + "loss": 0.3649, + "step": 1768 + }, + { + "epoch": 4.202200416294975, + "grad_norm": 1.0557911396026611, + "learning_rate": 5.807372175980975e-06, + "loss": 0.4027, + "step": 1769 + }, + { + "epoch": 4.204579244721974, + "grad_norm": 1.2128593921661377, + "learning_rate": 5.80499405469679e-06, + "loss": 0.4355, + "step": 1770 + }, + { + "epoch": 4.206958073148974, + "grad_norm": 1.3021197319030762, + "learning_rate": 5.802615933412604e-06, + "loss": 0.4508, + "step": 1771 + }, + { + "epoch": 4.209336901575973, + "grad_norm": 1.2116872072219849, + "learning_rate": 5.800237812128419e-06, + "loss": 0.3457, + "step": 1772 + }, + { + "epoch": 4.2117157300029735, + "grad_norm": 1.1973525285720825, + "learning_rate": 5.797859690844233e-06, + "loss": 0.3941, + "step": 1773 + }, + { + "epoch": 4.214094558429974, + "grad_norm": 1.0983175039291382, + "learning_rate": 5.795481569560047e-06, + "loss": 0.318, + "step": 1774 + }, + { + "epoch": 4.216473386856973, + "grad_norm": 0.997664213180542, + "learning_rate": 5.793103448275863e-06, + "loss": 0.3196, + "step": 1775 + }, + { + "epoch": 4.218852215283973, + "grad_norm": 1.0934463739395142, + "learning_rate": 5.7907253269916774e-06, + "loss": 0.3188, + "step": 1776 + }, + { + "epoch": 4.221231043710972, + "grad_norm": 1.1184375286102295, + "learning_rate": 5.788347205707492e-06, + "loss": 0.3663, + "step": 1777 + }, + { + "epoch": 4.223609872137972, + "grad_norm": 1.071589469909668, + "learning_rate": 5.785969084423307e-06, + "loss": 0.3543, + "step": 1778 + }, + { + "epoch": 4.2259887005649714, + "grad_norm": 1.1968176364898682, + "learning_rate": 5.7835909631391204e-06, + "loss": 0.436, + "step": 1779 + }, + { + "epoch": 4.2283675289919715, + "grad_norm": 1.185860276222229, + "learning_rate": 5.781212841854935e-06, + "loss": 0.3801, + "step": 1780 + }, + { + "epoch": 4.230746357418971, + "grad_norm": 1.1128686666488647, + "learning_rate": 5.77883472057075e-06, + "loss": 0.3868, + "step": 1781 + }, + { + "epoch": 4.233125185845971, + "grad_norm": 1.0921882390975952, + "learning_rate": 5.776456599286564e-06, + "loss": 0.3752, + "step": 1782 + }, + { + "epoch": 4.235504014272971, + "grad_norm": 1.0837870836257935, + "learning_rate": 5.774078478002378e-06, + "loss": 0.3996, + "step": 1783 + }, + { + "epoch": 4.23788284269997, + "grad_norm": 0.9594762921333313, + "learning_rate": 5.771700356718193e-06, + "loss": 0.3789, + "step": 1784 + }, + { + "epoch": 4.24026167112697, + "grad_norm": 1.0875896215438843, + "learning_rate": 5.769322235434007e-06, + "loss": 0.3487, + "step": 1785 + }, + { + "epoch": 4.2426404995539695, + "grad_norm": 1.0368695259094238, + "learning_rate": 5.766944114149822e-06, + "loss": 0.3998, + "step": 1786 + }, + { + "epoch": 4.24501932798097, + "grad_norm": 1.0888853073120117, + "learning_rate": 5.7645659928656365e-06, + "loss": 0.336, + "step": 1787 + }, + { + "epoch": 4.247398156407969, + "grad_norm": 1.0571759939193726, + "learning_rate": 5.762187871581452e-06, + "loss": 0.363, + "step": 1788 + }, + { + "epoch": 4.249776984834969, + "grad_norm": 1.1709622144699097, + "learning_rate": 5.759809750297266e-06, + "loss": 0.3954, + "step": 1789 + }, + { + "epoch": 4.252155813261968, + "grad_norm": 1.2918146848678589, + "learning_rate": 5.75743162901308e-06, + "loss": 0.3906, + "step": 1790 + }, + { + "epoch": 4.254534641688968, + "grad_norm": 1.269065499305725, + "learning_rate": 5.755053507728895e-06, + "loss": 0.378, + "step": 1791 + }, + { + "epoch": 4.256913470115968, + "grad_norm": 1.1712253093719482, + "learning_rate": 5.7526753864447095e-06, + "loss": 0.3619, + "step": 1792 + }, + { + "epoch": 4.2592922985429675, + "grad_norm": 1.139806866645813, + "learning_rate": 5.750297265160524e-06, + "loss": 0.4189, + "step": 1793 + }, + { + "epoch": 4.261671126969968, + "grad_norm": 1.0285319089889526, + "learning_rate": 5.747919143876338e-06, + "loss": 0.3414, + "step": 1794 + }, + { + "epoch": 4.264049955396967, + "grad_norm": 1.1717127561569214, + "learning_rate": 5.7455410225921525e-06, + "loss": 0.3838, + "step": 1795 + }, + { + "epoch": 4.266428783823967, + "grad_norm": 0.9931778907775879, + "learning_rate": 5.743162901307967e-06, + "loss": 0.3943, + "step": 1796 + }, + { + "epoch": 4.268807612250966, + "grad_norm": 1.169741153717041, + "learning_rate": 5.740784780023782e-06, + "loss": 0.3601, + "step": 1797 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 1.0344288349151611, + "learning_rate": 5.7384066587395955e-06, + "loss": 0.3917, + "step": 1798 + }, + { + "epoch": 4.2735652691049655, + "grad_norm": 1.0947175025939941, + "learning_rate": 5.73602853745541e-06, + "loss": 0.3535, + "step": 1799 + }, + { + "epoch": 4.275944097531966, + "grad_norm": 0.9977245330810547, + "learning_rate": 5.733650416171225e-06, + "loss": 0.4121, + "step": 1800 + }, + { + "epoch": 4.275944097531966, + "eval_loss": 0.42980003356933594, + "eval_runtime": 23.3989, + "eval_samples_per_second": 31.967, + "eval_steps_per_second": 15.984, + "step": 1800 + }, + { + "epoch": 4.278322925958965, + "grad_norm": 1.171673059463501, + "learning_rate": 5.731272294887039e-06, + "loss": 0.3505, + "step": 1801 + }, + { + "epoch": 4.280701754385965, + "grad_norm": 1.1202528476715088, + "learning_rate": 5.728894173602855e-06, + "loss": 0.3801, + "step": 1802 + }, + { + "epoch": 4.283080582812965, + "grad_norm": 1.1559094190597534, + "learning_rate": 5.726516052318669e-06, + "loss": 0.3542, + "step": 1803 + }, + { + "epoch": 4.285459411239964, + "grad_norm": 0.9428724646568298, + "learning_rate": 5.724137931034483e-06, + "loss": 0.3059, + "step": 1804 + }, + { + "epoch": 4.287838239666964, + "grad_norm": 1.031258463859558, + "learning_rate": 5.721759809750298e-06, + "loss": 0.3264, + "step": 1805 + }, + { + "epoch": 4.2902170680939635, + "grad_norm": 0.9775452613830566, + "learning_rate": 5.719381688466112e-06, + "loss": 0.3723, + "step": 1806 + }, + { + "epoch": 4.292595896520964, + "grad_norm": 1.010990858078003, + "learning_rate": 5.717003567181927e-06, + "loss": 0.3227, + "step": 1807 + }, + { + "epoch": 4.294974724947963, + "grad_norm": 1.2755472660064697, + "learning_rate": 5.714625445897742e-06, + "loss": 0.4378, + "step": 1808 + }, + { + "epoch": 4.297353553374963, + "grad_norm": 1.0517765283584595, + "learning_rate": 5.712247324613555e-06, + "loss": 0.393, + "step": 1809 + }, + { + "epoch": 4.299732381801962, + "grad_norm": 1.066077709197998, + "learning_rate": 5.70986920332937e-06, + "loss": 0.3726, + "step": 1810 + }, + { + "epoch": 4.302111210228962, + "grad_norm": 1.14970862865448, + "learning_rate": 5.707491082045185e-06, + "loss": 0.3355, + "step": 1811 + }, + { + "epoch": 4.3044900386559615, + "grad_norm": 1.1635456085205078, + "learning_rate": 5.705112960760999e-06, + "loss": 0.4442, + "step": 1812 + }, + { + "epoch": 4.306868867082962, + "grad_norm": 1.1156160831451416, + "learning_rate": 5.702734839476814e-06, + "loss": 0.2712, + "step": 1813 + }, + { + "epoch": 4.309247695509962, + "grad_norm": 1.0510598421096802, + "learning_rate": 5.7003567181926276e-06, + "loss": 0.3424, + "step": 1814 + }, + { + "epoch": 4.311626523936961, + "grad_norm": 0.9448025226593018, + "learning_rate": 5.697978596908443e-06, + "loss": 0.3242, + "step": 1815 + }, + { + "epoch": 4.314005352363961, + "grad_norm": 1.0749479532241821, + "learning_rate": 5.695600475624258e-06, + "loss": 0.3871, + "step": 1816 + }, + { + "epoch": 4.31638418079096, + "grad_norm": 1.1467669010162354, + "learning_rate": 5.693222354340072e-06, + "loss": 0.3394, + "step": 1817 + }, + { + "epoch": 4.31876300921796, + "grad_norm": 1.1261898279190063, + "learning_rate": 5.690844233055887e-06, + "loss": 0.3283, + "step": 1818 + }, + { + "epoch": 4.3211418376449595, + "grad_norm": 1.1866028308868408, + "learning_rate": 5.688466111771701e-06, + "loss": 0.3648, + "step": 1819 + }, + { + "epoch": 4.32352066607196, + "grad_norm": 1.0991865396499634, + "learning_rate": 5.686087990487515e-06, + "loss": 0.375, + "step": 1820 + }, + { + "epoch": 4.325899494498959, + "grad_norm": 1.27780020236969, + "learning_rate": 5.68370986920333e-06, + "loss": 0.3626, + "step": 1821 + }, + { + "epoch": 4.328278322925959, + "grad_norm": 1.0596758127212524, + "learning_rate": 5.6813317479191445e-06, + "loss": 0.3542, + "step": 1822 + }, + { + "epoch": 4.330657151352959, + "grad_norm": 1.0962541103363037, + "learning_rate": 5.678953626634959e-06, + "loss": 0.4122, + "step": 1823 + }, + { + "epoch": 4.333035979779958, + "grad_norm": 1.1500521898269653, + "learning_rate": 5.676575505350773e-06, + "loss": 0.3987, + "step": 1824 + }, + { + "epoch": 4.335414808206958, + "grad_norm": 1.162178874015808, + "learning_rate": 5.6741973840665874e-06, + "loss": 0.4072, + "step": 1825 + }, + { + "epoch": 4.337793636633958, + "grad_norm": 1.2090063095092773, + "learning_rate": 5.671819262782402e-06, + "loss": 0.4065, + "step": 1826 + }, + { + "epoch": 4.340172465060958, + "grad_norm": 1.1765305995941162, + "learning_rate": 5.669441141498217e-06, + "loss": 0.3577, + "step": 1827 + }, + { + "epoch": 4.342551293487957, + "grad_norm": 1.1549410820007324, + "learning_rate": 5.667063020214031e-06, + "loss": 0.4037, + "step": 1828 + }, + { + "epoch": 4.344930121914957, + "grad_norm": 0.9808195233345032, + "learning_rate": 5.664684898929847e-06, + "loss": 0.3413, + "step": 1829 + }, + { + "epoch": 4.347308950341956, + "grad_norm": 1.0862500667572021, + "learning_rate": 5.6623067776456605e-06, + "loss": 0.3435, + "step": 1830 + }, + { + "epoch": 4.349687778768956, + "grad_norm": 1.1457101106643677, + "learning_rate": 5.659928656361475e-06, + "loss": 0.4042, + "step": 1831 + }, + { + "epoch": 4.352066607195956, + "grad_norm": 1.2891385555267334, + "learning_rate": 5.65755053507729e-06, + "loss": 0.4579, + "step": 1832 + }, + { + "epoch": 4.354445435622956, + "grad_norm": 1.0207252502441406, + "learning_rate": 5.655172413793104e-06, + "loss": 0.3277, + "step": 1833 + }, + { + "epoch": 4.356824264049956, + "grad_norm": 1.1461669206619263, + "learning_rate": 5.652794292508919e-06, + "loss": 0.3525, + "step": 1834 + }, + { + "epoch": 4.359203092476955, + "grad_norm": 1.0668869018554688, + "learning_rate": 5.650416171224733e-06, + "loss": 0.3534, + "step": 1835 + }, + { + "epoch": 4.361581920903955, + "grad_norm": 1.123878002166748, + "learning_rate": 5.648038049940547e-06, + "loss": 0.3018, + "step": 1836 + }, + { + "epoch": 4.363960749330954, + "grad_norm": 1.0888853073120117, + "learning_rate": 5.645659928656362e-06, + "loss": 0.3482, + "step": 1837 + }, + { + "epoch": 4.366339577757954, + "grad_norm": 1.1917340755462646, + "learning_rate": 5.6432818073721765e-06, + "loss": 0.3875, + "step": 1838 + }, + { + "epoch": 4.3687184061849536, + "grad_norm": 1.3374992609024048, + "learning_rate": 5.64090368608799e-06, + "loss": 0.416, + "step": 1839 + }, + { + "epoch": 4.371097234611954, + "grad_norm": 1.1784924268722534, + "learning_rate": 5.638525564803805e-06, + "loss": 0.4233, + "step": 1840 + }, + { + "epoch": 4.373476063038954, + "grad_norm": 1.102421760559082, + "learning_rate": 5.6361474435196195e-06, + "loss": 0.3484, + "step": 1841 + }, + { + "epoch": 4.375854891465953, + "grad_norm": 1.1543748378753662, + "learning_rate": 5.633769322235434e-06, + "loss": 0.3243, + "step": 1842 + }, + { + "epoch": 4.378233719892953, + "grad_norm": 1.0397084951400757, + "learning_rate": 5.63139120095125e-06, + "loss": 0.2967, + "step": 1843 + }, + { + "epoch": 4.380612548319952, + "grad_norm": 1.3203734159469604, + "learning_rate": 5.629013079667064e-06, + "loss": 0.3865, + "step": 1844 + }, + { + "epoch": 4.382991376746952, + "grad_norm": 1.1515718698501587, + "learning_rate": 5.626634958382878e-06, + "loss": 0.4421, + "step": 1845 + }, + { + "epoch": 4.385370205173952, + "grad_norm": 1.0689294338226318, + "learning_rate": 5.624256837098693e-06, + "loss": 0.3861, + "step": 1846 + }, + { + "epoch": 4.387749033600952, + "grad_norm": 1.2824926376342773, + "learning_rate": 5.621878715814507e-06, + "loss": 0.4011, + "step": 1847 + }, + { + "epoch": 4.390127862027951, + "grad_norm": 0.965262234210968, + "learning_rate": 5.619500594530322e-06, + "loss": 0.3176, + "step": 1848 + }, + { + "epoch": 4.392506690454951, + "grad_norm": 1.0601681470870972, + "learning_rate": 5.617122473246136e-06, + "loss": 0.3454, + "step": 1849 + }, + { + "epoch": 4.39488551888195, + "grad_norm": 1.1572606563568115, + "learning_rate": 5.61474435196195e-06, + "loss": 0.3267, + "step": 1850 + }, + { + "epoch": 4.39488551888195, + "eval_loss": 0.4298034608364105, + "eval_runtime": 23.4512, + "eval_samples_per_second": 31.896, + "eval_steps_per_second": 15.948, + "step": 1850 + }, + { + "epoch": 4.39726434730895, + "grad_norm": 1.153000831604004, + "learning_rate": 5.612366230677765e-06, + "loss": 0.3468, + "step": 1851 + }, + { + "epoch": 4.39964317573595, + "grad_norm": 1.0936633348464966, + "learning_rate": 5.609988109393579e-06, + "loss": 0.301, + "step": 1852 + }, + { + "epoch": 4.40202200416295, + "grad_norm": 1.0826858282089233, + "learning_rate": 5.607609988109394e-06, + "loss": 0.3454, + "step": 1853 + }, + { + "epoch": 4.40440083258995, + "grad_norm": 1.2001688480377197, + "learning_rate": 5.605231866825208e-06, + "loss": 0.405, + "step": 1854 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 1.2433090209960938, + "learning_rate": 5.602853745541022e-06, + "loss": 0.4193, + "step": 1855 + }, + { + "epoch": 4.409158489443949, + "grad_norm": 1.3248966932296753, + "learning_rate": 5.600475624256838e-06, + "loss": 0.4281, + "step": 1856 + }, + { + "epoch": 4.411537317870948, + "grad_norm": 1.2866051197052002, + "learning_rate": 5.5980975029726525e-06, + "loss": 0.4245, + "step": 1857 + }, + { + "epoch": 4.413916146297948, + "grad_norm": 1.223488688468933, + "learning_rate": 5.595719381688467e-06, + "loss": 0.3839, + "step": 1858 + }, + { + "epoch": 4.416294974724948, + "grad_norm": 1.3462259769439697, + "learning_rate": 5.593341260404282e-06, + "loss": 0.3939, + "step": 1859 + }, + { + "epoch": 4.418673803151948, + "grad_norm": 1.25753915309906, + "learning_rate": 5.5909631391200954e-06, + "loss": 0.4181, + "step": 1860 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 1.1392059326171875, + "learning_rate": 5.58858501783591e-06, + "loss": 0.3369, + "step": 1861 + }, + { + "epoch": 4.423431460005947, + "grad_norm": 1.2304483652114868, + "learning_rate": 5.586206896551725e-06, + "loss": 0.3683, + "step": 1862 + }, + { + "epoch": 4.425810288432947, + "grad_norm": 1.1137241125106812, + "learning_rate": 5.583828775267539e-06, + "loss": 0.4322, + "step": 1863 + }, + { + "epoch": 4.428189116859946, + "grad_norm": 1.0442771911621094, + "learning_rate": 5.581450653983354e-06, + "loss": 0.4698, + "step": 1864 + }, + { + "epoch": 4.430567945286946, + "grad_norm": 0.9731234312057495, + "learning_rate": 5.579072532699168e-06, + "loss": 0.3495, + "step": 1865 + }, + { + "epoch": 4.432946773713946, + "grad_norm": 1.2755101919174194, + "learning_rate": 5.576694411414982e-06, + "loss": 0.4045, + "step": 1866 + }, + { + "epoch": 4.435325602140946, + "grad_norm": 1.1807687282562256, + "learning_rate": 5.574316290130797e-06, + "loss": 0.4029, + "step": 1867 + }, + { + "epoch": 4.437704430567945, + "grad_norm": 0.9842526912689209, + "learning_rate": 5.5719381688466115e-06, + "loss": 0.3471, + "step": 1868 + }, + { + "epoch": 4.440083258994945, + "grad_norm": 1.2282215356826782, + "learning_rate": 5.569560047562425e-06, + "loss": 0.3638, + "step": 1869 + }, + { + "epoch": 4.442462087421944, + "grad_norm": 1.116112470626831, + "learning_rate": 5.5671819262782415e-06, + "loss": 0.4092, + "step": 1870 + }, + { + "epoch": 4.444840915848944, + "grad_norm": 1.1269196271896362, + "learning_rate": 5.564803804994055e-06, + "loss": 0.4167, + "step": 1871 + }, + { + "epoch": 4.4472197442759445, + "grad_norm": 1.124638319015503, + "learning_rate": 5.56242568370987e-06, + "loss": 0.4137, + "step": 1872 + }, + { + "epoch": 4.449598572702944, + "grad_norm": 1.0812554359436035, + "learning_rate": 5.5600475624256845e-06, + "loss": 0.3627, + "step": 1873 + }, + { + "epoch": 4.451977401129944, + "grad_norm": 1.1244096755981445, + "learning_rate": 5.557669441141499e-06, + "loss": 0.3515, + "step": 1874 + }, + { + "epoch": 4.454356229556943, + "grad_norm": 1.1820361614227295, + "learning_rate": 5.555291319857313e-06, + "loss": 0.367, + "step": 1875 + }, + { + "epoch": 4.456735057983943, + "grad_norm": 1.1670238971710205, + "learning_rate": 5.5529131985731275e-06, + "loss": 0.3404, + "step": 1876 + }, + { + "epoch": 4.459113886410942, + "grad_norm": 1.2300233840942383, + "learning_rate": 5.550535077288942e-06, + "loss": 0.4149, + "step": 1877 + }, + { + "epoch": 4.461492714837942, + "grad_norm": 1.0117051601409912, + "learning_rate": 5.548156956004757e-06, + "loss": 0.3713, + "step": 1878 + }, + { + "epoch": 4.463871543264942, + "grad_norm": 1.146887183189392, + "learning_rate": 5.545778834720571e-06, + "loss": 0.3324, + "step": 1879 + }, + { + "epoch": 4.466250371691942, + "grad_norm": 1.0562372207641602, + "learning_rate": 5.543400713436385e-06, + "loss": 0.3059, + "step": 1880 + }, + { + "epoch": 4.468629200118942, + "grad_norm": 1.1711965799331665, + "learning_rate": 5.5410225921522e-06, + "loss": 0.4221, + "step": 1881 + }, + { + "epoch": 4.471008028545941, + "grad_norm": 1.0895520448684692, + "learning_rate": 5.538644470868014e-06, + "loss": 0.408, + "step": 1882 + }, + { + "epoch": 4.473386856972941, + "grad_norm": 1.2900350093841553, + "learning_rate": 5.536266349583829e-06, + "loss": 0.3046, + "step": 1883 + }, + { + "epoch": 4.47576568539994, + "grad_norm": 1.292560338973999, + "learning_rate": 5.533888228299644e-06, + "loss": 0.4203, + "step": 1884 + }, + { + "epoch": 4.4781445138269405, + "grad_norm": 1.1621716022491455, + "learning_rate": 5.531510107015459e-06, + "loss": 0.3662, + "step": 1885 + }, + { + "epoch": 4.48052334225394, + "grad_norm": 1.2611571550369263, + "learning_rate": 5.529131985731273e-06, + "loss": 0.3385, + "step": 1886 + }, + { + "epoch": 4.48290217068094, + "grad_norm": 1.1413981914520264, + "learning_rate": 5.526753864447087e-06, + "loss": 0.3767, + "step": 1887 + }, + { + "epoch": 4.485280999107939, + "grad_norm": 1.114806056022644, + "learning_rate": 5.524375743162902e-06, + "loss": 0.3452, + "step": 1888 + }, + { + "epoch": 4.487659827534939, + "grad_norm": 1.131904125213623, + "learning_rate": 5.521997621878717e-06, + "loss": 0.3438, + "step": 1889 + }, + { + "epoch": 4.490038655961939, + "grad_norm": 1.1222776174545288, + "learning_rate": 5.51961950059453e-06, + "loss": 0.339, + "step": 1890 + }, + { + "epoch": 4.492417484388938, + "grad_norm": 1.3343212604522705, + "learning_rate": 5.517241379310345e-06, + "loss": 0.4169, + "step": 1891 + }, + { + "epoch": 4.4947963128159385, + "grad_norm": 1.2191845178604126, + "learning_rate": 5.51486325802616e-06, + "loss": 0.4758, + "step": 1892 + }, + { + "epoch": 4.497175141242938, + "grad_norm": 1.1618508100509644, + "learning_rate": 5.512485136741974e-06, + "loss": 0.3479, + "step": 1893 + }, + { + "epoch": 4.499553969669938, + "grad_norm": 1.351635217666626, + "learning_rate": 5.510107015457789e-06, + "loss": 0.4407, + "step": 1894 + }, + { + "epoch": 4.501932798096937, + "grad_norm": 1.0888478755950928, + "learning_rate": 5.507728894173603e-06, + "loss": 0.3227, + "step": 1895 + }, + { + "epoch": 4.504311626523937, + "grad_norm": 1.1620545387268066, + "learning_rate": 5.505350772889417e-06, + "loss": 0.3166, + "step": 1896 + }, + { + "epoch": 4.506690454950936, + "grad_norm": 1.3431347608566284, + "learning_rate": 5.502972651605233e-06, + "loss": 0.3393, + "step": 1897 + }, + { + "epoch": 4.509069283377936, + "grad_norm": 1.137481689453125, + "learning_rate": 5.500594530321047e-06, + "loss": 0.378, + "step": 1898 + }, + { + "epoch": 4.5114481118049365, + "grad_norm": 1.1959136724472046, + "learning_rate": 5.498216409036862e-06, + "loss": 0.4205, + "step": 1899 + }, + { + "epoch": 4.513826940231936, + "grad_norm": 1.1828105449676514, + "learning_rate": 5.4958382877526765e-06, + "loss": 0.3705, + "step": 1900 + }, + { + "epoch": 4.513826940231936, + "eval_loss": 0.42881694436073303, + "eval_runtime": 23.3018, + "eval_samples_per_second": 32.1, + "eval_steps_per_second": 16.05, + "step": 1900 + }, + { + "epoch": 4.516205768658936, + "grad_norm": 1.1837739944458008, + "learning_rate": 5.49346016646849e-06, + "loss": 0.3865, + "step": 1901 + }, + { + "epoch": 4.518584597085935, + "grad_norm": 1.206076979637146, + "learning_rate": 5.491082045184305e-06, + "loss": 0.3483, + "step": 1902 + }, + { + "epoch": 4.520963425512935, + "grad_norm": 1.0585206747055054, + "learning_rate": 5.4887039239001195e-06, + "loss": 0.3646, + "step": 1903 + }, + { + "epoch": 4.523342253939934, + "grad_norm": 1.2180304527282715, + "learning_rate": 5.486325802615934e-06, + "loss": 0.3452, + "step": 1904 + }, + { + "epoch": 4.5257210823669345, + "grad_norm": 1.1683738231658936, + "learning_rate": 5.483947681331748e-06, + "loss": 0.3298, + "step": 1905 + }, + { + "epoch": 4.528099910793934, + "grad_norm": 1.1180732250213623, + "learning_rate": 5.4815695600475625e-06, + "loss": 0.2824, + "step": 1906 + }, + { + "epoch": 4.530478739220934, + "grad_norm": 1.1328343152999878, + "learning_rate": 5.479191438763377e-06, + "loss": 0.3916, + "step": 1907 + }, + { + "epoch": 4.532857567647933, + "grad_norm": 1.2536476850509644, + "learning_rate": 5.476813317479192e-06, + "loss": 0.3833, + "step": 1908 + }, + { + "epoch": 4.535236396074933, + "grad_norm": 1.262144923210144, + "learning_rate": 5.474435196195006e-06, + "loss": 0.4268, + "step": 1909 + }, + { + "epoch": 4.537615224501932, + "grad_norm": 1.0134221315383911, + "learning_rate": 5.47205707491082e-06, + "loss": 0.3419, + "step": 1910 + }, + { + "epoch": 4.539994052928932, + "grad_norm": 1.2848020792007446, + "learning_rate": 5.4696789536266355e-06, + "loss": 0.4045, + "step": 1911 + }, + { + "epoch": 4.5423728813559325, + "grad_norm": 1.2121903896331787, + "learning_rate": 5.46730083234245e-06, + "loss": 0.3606, + "step": 1912 + }, + { + "epoch": 4.544751709782932, + "grad_norm": 1.2171224355697632, + "learning_rate": 5.464922711058265e-06, + "loss": 0.3743, + "step": 1913 + }, + { + "epoch": 4.547130538209932, + "grad_norm": 1.1701767444610596, + "learning_rate": 5.462544589774079e-06, + "loss": 0.3272, + "step": 1914 + }, + { + "epoch": 4.549509366636931, + "grad_norm": 1.0890527963638306, + "learning_rate": 5.460166468489894e-06, + "loss": 0.4191, + "step": 1915 + }, + { + "epoch": 4.551888195063931, + "grad_norm": 1.3398360013961792, + "learning_rate": 5.457788347205708e-06, + "loss": 0.38, + "step": 1916 + }, + { + "epoch": 4.55426702349093, + "grad_norm": 1.1327080726623535, + "learning_rate": 5.455410225921522e-06, + "loss": 0.3597, + "step": 1917 + }, + { + "epoch": 4.5566458519179305, + "grad_norm": 1.012004017829895, + "learning_rate": 5.453032104637337e-06, + "loss": 0.36, + "step": 1918 + }, + { + "epoch": 4.55902468034493, + "grad_norm": 1.2158269882202148, + "learning_rate": 5.4506539833531516e-06, + "loss": 0.3724, + "step": 1919 + }, + { + "epoch": 4.56140350877193, + "grad_norm": 1.2282764911651611, + "learning_rate": 5.448275862068966e-06, + "loss": 0.4459, + "step": 1920 + }, + { + "epoch": 4.56378233719893, + "grad_norm": 1.2691152095794678, + "learning_rate": 5.44589774078478e-06, + "loss": 0.416, + "step": 1921 + }, + { + "epoch": 4.566161165625929, + "grad_norm": 1.1523921489715576, + "learning_rate": 5.4435196195005945e-06, + "loss": 0.3587, + "step": 1922 + }, + { + "epoch": 4.568539994052929, + "grad_norm": 1.2529860734939575, + "learning_rate": 5.441141498216409e-06, + "loss": 0.326, + "step": 1923 + }, + { + "epoch": 4.570918822479928, + "grad_norm": 1.1370042562484741, + "learning_rate": 5.438763376932224e-06, + "loss": 0.4051, + "step": 1924 + }, + { + "epoch": 4.5732976509069285, + "grad_norm": 1.1958023309707642, + "learning_rate": 5.436385255648039e-06, + "loss": 0.3534, + "step": 1925 + }, + { + "epoch": 4.575676479333928, + "grad_norm": 1.3130226135253906, + "learning_rate": 5.434007134363854e-06, + "loss": 0.4201, + "step": 1926 + }, + { + "epoch": 4.578055307760928, + "grad_norm": 1.25039541721344, + "learning_rate": 5.431629013079668e-06, + "loss": 0.4177, + "step": 1927 + }, + { + "epoch": 4.580434136187927, + "grad_norm": 1.2225428819656372, + "learning_rate": 5.429250891795482e-06, + "loss": 0.4031, + "step": 1928 + }, + { + "epoch": 4.582812964614927, + "grad_norm": 1.2176666259765625, + "learning_rate": 5.426872770511297e-06, + "loss": 0.3476, + "step": 1929 + }, + { + "epoch": 4.585191793041927, + "grad_norm": 1.3239904642105103, + "learning_rate": 5.424494649227111e-06, + "loss": 0.3503, + "step": 1930 + }, + { + "epoch": 4.5875706214689265, + "grad_norm": 1.2813798189163208, + "learning_rate": 5.422116527942925e-06, + "loss": 0.3744, + "step": 1931 + }, + { + "epoch": 4.589949449895927, + "grad_norm": 1.1596063375473022, + "learning_rate": 5.41973840665874e-06, + "loss": 0.387, + "step": 1932 + }, + { + "epoch": 4.592328278322926, + "grad_norm": 1.1604424715042114, + "learning_rate": 5.417360285374554e-06, + "loss": 0.3134, + "step": 1933 + }, + { + "epoch": 4.594707106749926, + "grad_norm": 1.335079550743103, + "learning_rate": 5.414982164090369e-06, + "loss": 0.4324, + "step": 1934 + }, + { + "epoch": 4.597085935176925, + "grad_norm": 1.343200445175171, + "learning_rate": 5.412604042806184e-06, + "loss": 0.3834, + "step": 1935 + }, + { + "epoch": 4.599464763603925, + "grad_norm": 1.1376577615737915, + "learning_rate": 5.410225921521997e-06, + "loss": 0.3901, + "step": 1936 + }, + { + "epoch": 4.601843592030924, + "grad_norm": 1.156501054763794, + "learning_rate": 5.407847800237812e-06, + "loss": 0.3935, + "step": 1937 + }, + { + "epoch": 4.6042224204579245, + "grad_norm": 1.1844979524612427, + "learning_rate": 5.4054696789536275e-06, + "loss": 0.3852, + "step": 1938 + }, + { + "epoch": 4.606601248884925, + "grad_norm": 1.1028953790664673, + "learning_rate": 5.403091557669442e-06, + "loss": 0.4185, + "step": 1939 + }, + { + "epoch": 4.608980077311924, + "grad_norm": 1.2764723300933838, + "learning_rate": 5.400713436385257e-06, + "loss": 0.4532, + "step": 1940 + }, + { + "epoch": 4.611358905738924, + "grad_norm": 1.079726219177246, + "learning_rate": 5.398335315101071e-06, + "loss": 0.3504, + "step": 1941 + }, + { + "epoch": 4.613737734165923, + "grad_norm": 1.1727697849273682, + "learning_rate": 5.395957193816885e-06, + "loss": 0.3545, + "step": 1942 + }, + { + "epoch": 4.616116562592923, + "grad_norm": 1.2976915836334229, + "learning_rate": 5.3935790725327e-06, + "loss": 0.3884, + "step": 1943 + }, + { + "epoch": 4.6184953910199225, + "grad_norm": 1.192341685295105, + "learning_rate": 5.391200951248514e-06, + "loss": 0.3745, + "step": 1944 + }, + { + "epoch": 4.620874219446923, + "grad_norm": 1.1571686267852783, + "learning_rate": 5.388822829964329e-06, + "loss": 0.4193, + "step": 1945 + }, + { + "epoch": 4.623253047873922, + "grad_norm": 1.1785831451416016, + "learning_rate": 5.386444708680143e-06, + "loss": 0.3941, + "step": 1946 + }, + { + "epoch": 4.625631876300922, + "grad_norm": 1.1980559825897217, + "learning_rate": 5.384066587395957e-06, + "loss": 0.4364, + "step": 1947 + }, + { + "epoch": 4.628010704727922, + "grad_norm": 1.1314550638198853, + "learning_rate": 5.381688466111772e-06, + "loss": 0.4138, + "step": 1948 + }, + { + "epoch": 4.630389533154921, + "grad_norm": 1.3495122194290161, + "learning_rate": 5.3793103448275865e-06, + "loss": 0.3704, + "step": 1949 + }, + { + "epoch": 4.632768361581921, + "grad_norm": 1.2660086154937744, + "learning_rate": 5.376932223543401e-06, + "loss": 0.4398, + "step": 1950 + }, + { + "epoch": 4.632768361581921, + "eval_loss": 0.4279799163341522, + "eval_runtime": 23.3725, + "eval_samples_per_second": 32.003, + "eval_steps_per_second": 16.002, + "step": 1950 + }, + { + "epoch": 4.6351471900089205, + "grad_norm": 1.1495308876037598, + "learning_rate": 5.374554102259215e-06, + "loss": 0.3123, + "step": 1951 + }, + { + "epoch": 4.637526018435921, + "grad_norm": 1.1677197217941284, + "learning_rate": 5.37217598097503e-06, + "loss": 0.3281, + "step": 1952 + }, + { + "epoch": 4.63990484686292, + "grad_norm": 1.1564432382583618, + "learning_rate": 5.369797859690845e-06, + "loss": 0.3855, + "step": 1953 + }, + { + "epoch": 4.64228367528992, + "grad_norm": 1.0065945386886597, + "learning_rate": 5.3674197384066595e-06, + "loss": 0.3955, + "step": 1954 + }, + { + "epoch": 4.644662503716919, + "grad_norm": 1.3421404361724854, + "learning_rate": 5.365041617122474e-06, + "loss": 0.3318, + "step": 1955 + }, + { + "epoch": 4.647041332143919, + "grad_norm": 1.3160626888275146, + "learning_rate": 5.362663495838289e-06, + "loss": 0.3986, + "step": 1956 + }, + { + "epoch": 4.6494201605709184, + "grad_norm": 1.1863969564437866, + "learning_rate": 5.3602853745541025e-06, + "loss": 0.3765, + "step": 1957 + }, + { + "epoch": 4.6517989889979185, + "grad_norm": 1.257302165031433, + "learning_rate": 5.357907253269917e-06, + "loss": 0.4213, + "step": 1958 + }, + { + "epoch": 4.654177817424918, + "grad_norm": 1.2928720712661743, + "learning_rate": 5.355529131985732e-06, + "loss": 0.3859, + "step": 1959 + }, + { + "epoch": 4.656556645851918, + "grad_norm": 1.3593398332595825, + "learning_rate": 5.353151010701546e-06, + "loss": 0.457, + "step": 1960 + }, + { + "epoch": 4.658935474278918, + "grad_norm": 1.0612069368362427, + "learning_rate": 5.35077288941736e-06, + "loss": 0.3509, + "step": 1961 + }, + { + "epoch": 4.661314302705917, + "grad_norm": 1.1116403341293335, + "learning_rate": 5.348394768133175e-06, + "loss": 0.4286, + "step": 1962 + }, + { + "epoch": 4.663693131132917, + "grad_norm": 1.1393296718597412, + "learning_rate": 5.346016646848989e-06, + "loss": 0.3716, + "step": 1963 + }, + { + "epoch": 4.6660719595599165, + "grad_norm": 1.0125194787979126, + "learning_rate": 5.343638525564804e-06, + "loss": 0.3524, + "step": 1964 + }, + { + "epoch": 4.668450787986917, + "grad_norm": 1.2206250429153442, + "learning_rate": 5.3412604042806186e-06, + "loss": 0.4338, + "step": 1965 + }, + { + "epoch": 4.670829616413916, + "grad_norm": 0.9432029128074646, + "learning_rate": 5.338882282996434e-06, + "loss": 0.3604, + "step": 1966 + }, + { + "epoch": 4.673208444840916, + "grad_norm": 1.2455750703811646, + "learning_rate": 5.336504161712248e-06, + "loss": 0.3525, + "step": 1967 + }, + { + "epoch": 4.675587273267915, + "grad_norm": 1.0410239696502686, + "learning_rate": 5.334126040428062e-06, + "loss": 0.3695, + "step": 1968 + }, + { + "epoch": 4.677966101694915, + "grad_norm": 1.1563447713851929, + "learning_rate": 5.331747919143877e-06, + "loss": 0.3545, + "step": 1969 + }, + { + "epoch": 4.680344930121915, + "grad_norm": 1.1289013624191284, + "learning_rate": 5.329369797859692e-06, + "loss": 0.3888, + "step": 1970 + }, + { + "epoch": 4.6827237585489145, + "grad_norm": 1.226755142211914, + "learning_rate": 5.326991676575506e-06, + "loss": 0.3295, + "step": 1971 + }, + { + "epoch": 4.685102586975915, + "grad_norm": 1.2362174987792969, + "learning_rate": 5.32461355529132e-06, + "loss": 0.3989, + "step": 1972 + }, + { + "epoch": 4.687481415402914, + "grad_norm": 1.068682074546814, + "learning_rate": 5.322235434007135e-06, + "loss": 0.3491, + "step": 1973 + }, + { + "epoch": 4.689860243829914, + "grad_norm": 1.272399663925171, + "learning_rate": 5.319857312722949e-06, + "loss": 0.4657, + "step": 1974 + }, + { + "epoch": 4.692239072256913, + "grad_norm": 1.1723976135253906, + "learning_rate": 5.317479191438764e-06, + "loss": 0.3563, + "step": 1975 + }, + { + "epoch": 4.694617900683913, + "grad_norm": 1.1083317995071411, + "learning_rate": 5.315101070154578e-06, + "loss": 0.3995, + "step": 1976 + }, + { + "epoch": 4.6969967291109125, + "grad_norm": 1.0272246599197388, + "learning_rate": 5.312722948870392e-06, + "loss": 0.3798, + "step": 1977 + }, + { + "epoch": 4.699375557537913, + "grad_norm": 1.112699031829834, + "learning_rate": 5.310344827586207e-06, + "loss": 0.3262, + "step": 1978 + }, + { + "epoch": 4.701754385964913, + "grad_norm": 1.2425813674926758, + "learning_rate": 5.307966706302022e-06, + "loss": 0.509, + "step": 1979 + }, + { + "epoch": 4.704133214391912, + "grad_norm": 1.1039142608642578, + "learning_rate": 5.305588585017837e-06, + "loss": 0.323, + "step": 1980 + }, + { + "epoch": 4.706512042818912, + "grad_norm": 1.3130589723587036, + "learning_rate": 5.3032104637336515e-06, + "loss": 0.4327, + "step": 1981 + }, + { + "epoch": 4.708890871245911, + "grad_norm": 1.0438907146453857, + "learning_rate": 5.300832342449465e-06, + "loss": 0.3465, + "step": 1982 + }, + { + "epoch": 4.711269699672911, + "grad_norm": 1.2099350690841675, + "learning_rate": 5.29845422116528e-06, + "loss": 0.4337, + "step": 1983 + }, + { + "epoch": 4.7136485280999105, + "grad_norm": 0.9775784611701965, + "learning_rate": 5.2960760998810945e-06, + "loss": 0.399, + "step": 1984 + }, + { + "epoch": 4.716027356526911, + "grad_norm": 1.3561195135116577, + "learning_rate": 5.293697978596909e-06, + "loss": 0.4204, + "step": 1985 + }, + { + "epoch": 4.71840618495391, + "grad_norm": 1.1045551300048828, + "learning_rate": 5.291319857312724e-06, + "loss": 0.2777, + "step": 1986 + }, + { + "epoch": 4.72078501338091, + "grad_norm": 1.0119092464447021, + "learning_rate": 5.2889417360285375e-06, + "loss": 0.3122, + "step": 1987 + }, + { + "epoch": 4.72316384180791, + "grad_norm": 1.2413631677627563, + "learning_rate": 5.286563614744352e-06, + "loss": 0.3954, + "step": 1988 + }, + { + "epoch": 4.725542670234909, + "grad_norm": 1.2027863264083862, + "learning_rate": 5.284185493460167e-06, + "loss": 0.4196, + "step": 1989 + }, + { + "epoch": 4.727921498661909, + "grad_norm": 1.1011801958084106, + "learning_rate": 5.281807372175981e-06, + "loss": 0.3901, + "step": 1990 + }, + { + "epoch": 4.730300327088909, + "grad_norm": 1.2521893978118896, + "learning_rate": 5.279429250891795e-06, + "loss": 0.3536, + "step": 1991 + }, + { + "epoch": 4.732679155515909, + "grad_norm": 1.1839587688446045, + "learning_rate": 5.27705112960761e-06, + "loss": 0.3747, + "step": 1992 + }, + { + "epoch": 4.735057983942908, + "grad_norm": 1.221303939819336, + "learning_rate": 5.274673008323425e-06, + "loss": 0.3998, + "step": 1993 + }, + { + "epoch": 4.737436812369908, + "grad_norm": 1.1418098211288452, + "learning_rate": 5.27229488703924e-06, + "loss": 0.3377, + "step": 1994 + }, + { + "epoch": 4.739815640796907, + "grad_norm": 1.098084568977356, + "learning_rate": 5.269916765755054e-06, + "loss": 0.3609, + "step": 1995 + }, + { + "epoch": 4.742194469223907, + "grad_norm": 1.2663143873214722, + "learning_rate": 5.267538644470869e-06, + "loss": 0.4367, + "step": 1996 + }, + { + "epoch": 4.744573297650907, + "grad_norm": 1.1939369440078735, + "learning_rate": 5.265160523186683e-06, + "loss": 0.3622, + "step": 1997 + }, + { + "epoch": 4.746952126077907, + "grad_norm": 1.0951242446899414, + "learning_rate": 5.262782401902497e-06, + "loss": 0.3603, + "step": 1998 + }, + { + "epoch": 4.749330954504907, + "grad_norm": 0.9754849076271057, + "learning_rate": 5.260404280618312e-06, + "loss": 0.3063, + "step": 1999 + }, + { + "epoch": 4.751709782931906, + "grad_norm": 1.230715274810791, + "learning_rate": 5.2580261593341266e-06, + "loss": 0.3117, + "step": 2000 + }, + { + "epoch": 4.751709782931906, + "eval_loss": 0.42703258991241455, + "eval_runtime": 23.1729, + "eval_samples_per_second": 32.279, + "eval_steps_per_second": 16.14, + "step": 2000 + }, + { + "epoch": 4.754088611358906, + "grad_norm": 1.1506050825119019, + "learning_rate": 5.255648038049941e-06, + "loss": 0.3233, + "step": 2001 + }, + { + "epoch": 4.756467439785905, + "grad_norm": 1.1924481391906738, + "learning_rate": 5.253269916765755e-06, + "loss": 0.4156, + "step": 2002 + }, + { + "epoch": 4.758846268212905, + "grad_norm": 1.170750379562378, + "learning_rate": 5.2508917954815695e-06, + "loss": 0.4477, + "step": 2003 + }, + { + "epoch": 4.761225096639905, + "grad_norm": 1.3022801876068115, + "learning_rate": 5.248513674197384e-06, + "loss": 0.3878, + "step": 2004 + }, + { + "epoch": 4.763603925066905, + "grad_norm": 1.4578815698623657, + "learning_rate": 5.246135552913199e-06, + "loss": 0.4981, + "step": 2005 + }, + { + "epoch": 4.765982753493904, + "grad_norm": 1.1409708261489868, + "learning_rate": 5.243757431629013e-06, + "loss": 0.3786, + "step": 2006 + }, + { + "epoch": 4.768361581920904, + "grad_norm": 1.2667869329452515, + "learning_rate": 5.241379310344829e-06, + "loss": 0.4225, + "step": 2007 + }, + { + "epoch": 4.770740410347903, + "grad_norm": 1.0564688444137573, + "learning_rate": 5.239001189060643e-06, + "loss": 0.3203, + "step": 2008 + }, + { + "epoch": 4.773119238774903, + "grad_norm": 1.1854530572891235, + "learning_rate": 5.236623067776457e-06, + "loss": 0.4128, + "step": 2009 + }, + { + "epoch": 4.775498067201903, + "grad_norm": 1.2330033779144287, + "learning_rate": 5.234244946492272e-06, + "loss": 0.381, + "step": 2010 + }, + { + "epoch": 4.777876895628903, + "grad_norm": 1.228974461555481, + "learning_rate": 5.2318668252080864e-06, + "loss": 0.3625, + "step": 2011 + }, + { + "epoch": 4.780255724055903, + "grad_norm": 1.278163194656372, + "learning_rate": 5.2294887039239e-06, + "loss": 0.3763, + "step": 2012 + }, + { + "epoch": 4.782634552482902, + "grad_norm": 1.2909750938415527, + "learning_rate": 5.227110582639715e-06, + "loss": 0.3758, + "step": 2013 + }, + { + "epoch": 4.785013380909902, + "grad_norm": 1.1436619758605957, + "learning_rate": 5.224732461355529e-06, + "loss": 0.3895, + "step": 2014 + }, + { + "epoch": 4.787392209336901, + "grad_norm": 1.1258983612060547, + "learning_rate": 5.222354340071344e-06, + "loss": 0.346, + "step": 2015 + }, + { + "epoch": 4.789771037763901, + "grad_norm": 0.987506628036499, + "learning_rate": 5.219976218787159e-06, + "loss": 0.3327, + "step": 2016 + }, + { + "epoch": 4.7921498661909006, + "grad_norm": 1.101040005683899, + "learning_rate": 5.217598097502972e-06, + "loss": 0.3208, + "step": 2017 + }, + { + "epoch": 4.794528694617901, + "grad_norm": 1.1363823413848877, + "learning_rate": 5.215219976218787e-06, + "loss": 0.3233, + "step": 2018 + }, + { + "epoch": 4.796907523044901, + "grad_norm": 1.1690577268600464, + "learning_rate": 5.212841854934602e-06, + "loss": 0.3658, + "step": 2019 + }, + { + "epoch": 4.7992863514719, + "grad_norm": 1.2604283094406128, + "learning_rate": 5.210463733650417e-06, + "loss": 0.3751, + "step": 2020 + }, + { + "epoch": 4.8016651798989, + "grad_norm": 1.3182450532913208, + "learning_rate": 5.208085612366232e-06, + "loss": 0.4059, + "step": 2021 + }, + { + "epoch": 4.804044008325899, + "grad_norm": 1.1195452213287354, + "learning_rate": 5.205707491082046e-06, + "loss": 0.3525, + "step": 2022 + }, + { + "epoch": 4.806422836752899, + "grad_norm": 1.0333186388015747, + "learning_rate": 5.20332936979786e-06, + "loss": 0.3613, + "step": 2023 + }, + { + "epoch": 4.808801665179899, + "grad_norm": 1.203368067741394, + "learning_rate": 5.200951248513675e-06, + "loss": 0.381, + "step": 2024 + }, + { + "epoch": 4.811180493606899, + "grad_norm": 1.2064144611358643, + "learning_rate": 5.198573127229489e-06, + "loss": 0.3948, + "step": 2025 + }, + { + "epoch": 4.813559322033898, + "grad_norm": 1.2426600456237793, + "learning_rate": 5.196195005945304e-06, + "loss": 0.4072, + "step": 2026 + }, + { + "epoch": 4.815938150460898, + "grad_norm": 1.1202892065048218, + "learning_rate": 5.1938168846611185e-06, + "loss": 0.3252, + "step": 2027 + }, + { + "epoch": 4.818316978887898, + "grad_norm": 1.179026484489441, + "learning_rate": 5.191438763376932e-06, + "loss": 0.3761, + "step": 2028 + }, + { + "epoch": 4.820695807314897, + "grad_norm": 1.1520802974700928, + "learning_rate": 5.189060642092747e-06, + "loss": 0.3794, + "step": 2029 + }, + { + "epoch": 4.823074635741897, + "grad_norm": 1.109853982925415, + "learning_rate": 5.1866825208085615e-06, + "loss": 0.3424, + "step": 2030 + }, + { + "epoch": 4.825453464168897, + "grad_norm": 1.2638977766036987, + "learning_rate": 5.184304399524376e-06, + "loss": 0.4054, + "step": 2031 + }, + { + "epoch": 4.827832292595897, + "grad_norm": 1.094199299812317, + "learning_rate": 5.18192627824019e-06, + "loss": 0.2876, + "step": 2032 + }, + { + "epoch": 4.830211121022896, + "grad_norm": 1.2783927917480469, + "learning_rate": 5.1795481569560045e-06, + "loss": 0.4206, + "step": 2033 + }, + { + "epoch": 4.832589949449896, + "grad_norm": 1.2296520471572876, + "learning_rate": 5.17717003567182e-06, + "loss": 0.3732, + "step": 2034 + }, + { + "epoch": 4.834968777876895, + "grad_norm": 1.2422195672988892, + "learning_rate": 5.1747919143876346e-06, + "loss": 0.3648, + "step": 2035 + }, + { + "epoch": 4.837347606303895, + "grad_norm": 1.217808485031128, + "learning_rate": 5.172413793103449e-06, + "loss": 0.3553, + "step": 2036 + }, + { + "epoch": 4.8397264347308955, + "grad_norm": 1.147058367729187, + "learning_rate": 5.170035671819264e-06, + "loss": 0.3908, + "step": 2037 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 1.2931337356567383, + "learning_rate": 5.1676575505350775e-06, + "loss": 0.4268, + "step": 2038 + }, + { + "epoch": 4.844484091584895, + "grad_norm": 1.1515430212020874, + "learning_rate": 5.165279429250892e-06, + "loss": 0.4234, + "step": 2039 + }, + { + "epoch": 4.846862920011894, + "grad_norm": 1.1211035251617432, + "learning_rate": 5.162901307966707e-06, + "loss": 0.3802, + "step": 2040 + }, + { + "epoch": 4.849241748438894, + "grad_norm": 1.2644450664520264, + "learning_rate": 5.160523186682521e-06, + "loss": 0.44, + "step": 2041 + }, + { + "epoch": 4.851620576865893, + "grad_norm": 1.2001146078109741, + "learning_rate": 5.158145065398336e-06, + "loss": 0.3624, + "step": 2042 + }, + { + "epoch": 4.853999405292893, + "grad_norm": 1.4030879735946655, + "learning_rate": 5.15576694411415e-06, + "loss": 0.4146, + "step": 2043 + }, + { + "epoch": 4.856378233719893, + "grad_norm": 1.134551763534546, + "learning_rate": 5.153388822829964e-06, + "loss": 0.3362, + "step": 2044 + }, + { + "epoch": 4.858757062146893, + "grad_norm": 1.1568751335144043, + "learning_rate": 5.151010701545779e-06, + "loss": 0.4206, + "step": 2045 + }, + { + "epoch": 4.861135890573893, + "grad_norm": 1.2107574939727783, + "learning_rate": 5.148632580261594e-06, + "loss": 0.4075, + "step": 2046 + }, + { + "epoch": 4.863514719000892, + "grad_norm": 1.237999677658081, + "learning_rate": 5.146254458977407e-06, + "loss": 0.3352, + "step": 2047 + }, + { + "epoch": 4.865893547427892, + "grad_norm": 1.2913504838943481, + "learning_rate": 5.143876337693224e-06, + "loss": 0.3334, + "step": 2048 + }, + { + "epoch": 4.868272375854891, + "grad_norm": 1.3780343532562256, + "learning_rate": 5.141498216409037e-06, + "loss": 0.3567, + "step": 2049 + }, + { + "epoch": 4.8706512042818915, + "grad_norm": 1.061675786972046, + "learning_rate": 5.139120095124852e-06, + "loss": 0.3221, + "step": 2050 + }, + { + "epoch": 4.8706512042818915, + "eval_loss": 0.42611539363861084, + "eval_runtime": 23.1405, + "eval_samples_per_second": 32.324, + "eval_steps_per_second": 16.162, + "step": 2050 + }, + { + "epoch": 4.873030032708891, + "grad_norm": 1.1853545904159546, + "learning_rate": 5.136741973840667e-06, + "loss": 0.351, + "step": 2051 + }, + { + "epoch": 4.875408861135891, + "grad_norm": 1.3596481084823608, + "learning_rate": 5.134363852556481e-06, + "loss": 0.3713, + "step": 2052 + }, + { + "epoch": 4.87778768956289, + "grad_norm": 1.171947717666626, + "learning_rate": 5.131985731272295e-06, + "loss": 0.3791, + "step": 2053 + }, + { + "epoch": 4.88016651798989, + "grad_norm": 1.0020349025726318, + "learning_rate": 5.12960760998811e-06, + "loss": 0.3066, + "step": 2054 + }, + { + "epoch": 4.882545346416889, + "grad_norm": 1.1247196197509766, + "learning_rate": 5.127229488703924e-06, + "loss": 0.3619, + "step": 2055 + }, + { + "epoch": 4.884924174843889, + "grad_norm": 1.3604203462600708, + "learning_rate": 5.124851367419739e-06, + "loss": 0.3151, + "step": 2056 + }, + { + "epoch": 4.887303003270889, + "grad_norm": 1.225115180015564, + "learning_rate": 5.1224732461355535e-06, + "loss": 0.4074, + "step": 2057 + }, + { + "epoch": 4.889681831697889, + "grad_norm": 1.0569473505020142, + "learning_rate": 5.120095124851367e-06, + "loss": 0.3802, + "step": 2058 + }, + { + "epoch": 4.892060660124889, + "grad_norm": 1.388362169265747, + "learning_rate": 5.117717003567182e-06, + "loss": 0.3987, + "step": 2059 + }, + { + "epoch": 4.894439488551888, + "grad_norm": 1.2732670307159424, + "learning_rate": 5.1153388822829964e-06, + "loss": 0.4372, + "step": 2060 + }, + { + "epoch": 4.896818316978888, + "grad_norm": 1.166775107383728, + "learning_rate": 5.112960760998812e-06, + "loss": 0.3481, + "step": 2061 + }, + { + "epoch": 4.899197145405887, + "grad_norm": 1.0907658338546753, + "learning_rate": 5.1105826397146265e-06, + "loss": 0.3838, + "step": 2062 + }, + { + "epoch": 4.9015759738328875, + "grad_norm": 1.4052003622055054, + "learning_rate": 5.108204518430441e-06, + "loss": 0.422, + "step": 2063 + }, + { + "epoch": 4.903954802259887, + "grad_norm": 1.0714243650436401, + "learning_rate": 5.105826397146255e-06, + "loss": 0.3922, + "step": 2064 + }, + { + "epoch": 4.906333630686887, + "grad_norm": 1.1573463678359985, + "learning_rate": 5.1034482758620695e-06, + "loss": 0.4149, + "step": 2065 + }, + { + "epoch": 4.908712459113886, + "grad_norm": 1.1445893049240112, + "learning_rate": 5.101070154577884e-06, + "loss": 0.3457, + "step": 2066 + }, + { + "epoch": 4.911091287540886, + "grad_norm": 1.164520263671875, + "learning_rate": 5.098692033293699e-06, + "loss": 0.3317, + "step": 2067 + }, + { + "epoch": 4.913470115967886, + "grad_norm": 1.1525871753692627, + "learning_rate": 5.0963139120095125e-06, + "loss": 0.3839, + "step": 2068 + }, + { + "epoch": 4.915848944394885, + "grad_norm": 1.2158613204956055, + "learning_rate": 5.093935790725327e-06, + "loss": 0.3894, + "step": 2069 + }, + { + "epoch": 4.9182277728218855, + "grad_norm": 1.3426412343978882, + "learning_rate": 5.091557669441142e-06, + "loss": 0.3993, + "step": 2070 + }, + { + "epoch": 4.920606601248885, + "grad_norm": 1.3264241218566895, + "learning_rate": 5.089179548156956e-06, + "loss": 0.4324, + "step": 2071 + }, + { + "epoch": 4.922985429675885, + "grad_norm": 1.389930009841919, + "learning_rate": 5.086801426872771e-06, + "loss": 0.3949, + "step": 2072 + }, + { + "epoch": 4.925364258102884, + "grad_norm": 1.2342838048934937, + "learning_rate": 5.084423305588585e-06, + "loss": 0.3847, + "step": 2073 + }, + { + "epoch": 4.927743086529884, + "grad_norm": 1.11894690990448, + "learning_rate": 5.082045184304399e-06, + "loss": 0.3562, + "step": 2074 + }, + { + "epoch": 4.930121914956883, + "grad_norm": 1.241254210472107, + "learning_rate": 5.079667063020215e-06, + "loss": 0.3742, + "step": 2075 + }, + { + "epoch": 4.9325007433838834, + "grad_norm": 1.3489124774932861, + "learning_rate": 5.077288941736029e-06, + "loss": 0.3976, + "step": 2076 + }, + { + "epoch": 4.9348795718108835, + "grad_norm": 1.0890111923217773, + "learning_rate": 5.074910820451844e-06, + "loss": 0.3699, + "step": 2077 + }, + { + "epoch": 4.937258400237883, + "grad_norm": 1.1254839897155762, + "learning_rate": 5.072532699167659e-06, + "loss": 0.3449, + "step": 2078 + }, + { + "epoch": 4.939637228664883, + "grad_norm": 1.2237194776535034, + "learning_rate": 5.070154577883472e-06, + "loss": 0.4357, + "step": 2079 + }, + { + "epoch": 4.942016057091882, + "grad_norm": 1.0360277891159058, + "learning_rate": 5.067776456599287e-06, + "loss": 0.3364, + "step": 2080 + }, + { + "epoch": 4.944394885518882, + "grad_norm": 1.2532778978347778, + "learning_rate": 5.0653983353151016e-06, + "loss": 0.3866, + "step": 2081 + }, + { + "epoch": 4.946773713945881, + "grad_norm": 1.0111490488052368, + "learning_rate": 5.063020214030916e-06, + "loss": 0.3519, + "step": 2082 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 1.2915304899215698, + "learning_rate": 5.06064209274673e-06, + "loss": 0.3729, + "step": 2083 + }, + { + "epoch": 4.951531370799881, + "grad_norm": 1.1338180303573608, + "learning_rate": 5.0582639714625446e-06, + "loss": 0.3803, + "step": 2084 + }, + { + "epoch": 4.953910199226881, + "grad_norm": 1.2871617078781128, + "learning_rate": 5.055885850178359e-06, + "loss": 0.4118, + "step": 2085 + }, + { + "epoch": 4.956289027653881, + "grad_norm": 1.2273420095443726, + "learning_rate": 5.053507728894174e-06, + "loss": 0.4023, + "step": 2086 + }, + { + "epoch": 4.95866785608088, + "grad_norm": 1.1978644132614136, + "learning_rate": 5.051129607609988e-06, + "loss": 0.3545, + "step": 2087 + }, + { + "epoch": 4.96104668450788, + "grad_norm": 1.0667328834533691, + "learning_rate": 5.048751486325802e-06, + "loss": 0.3621, + "step": 2088 + }, + { + "epoch": 4.963425512934879, + "grad_norm": 1.0845947265625, + "learning_rate": 5.046373365041618e-06, + "loss": 0.3775, + "step": 2089 + }, + { + "epoch": 4.9658043413618795, + "grad_norm": 1.3068907260894775, + "learning_rate": 5.043995243757432e-06, + "loss": 0.3881, + "step": 2090 + }, + { + "epoch": 4.968183169788879, + "grad_norm": 1.4938195943832397, + "learning_rate": 5.041617122473247e-06, + "loss": 0.4284, + "step": 2091 + }, + { + "epoch": 4.970561998215879, + "grad_norm": 1.2085968255996704, + "learning_rate": 5.0392390011890614e-06, + "loss": 0.3312, + "step": 2092 + }, + { + "epoch": 4.972940826642878, + "grad_norm": 1.2795571088790894, + "learning_rate": 5.036860879904876e-06, + "loss": 0.4101, + "step": 2093 + }, + { + "epoch": 4.975319655069878, + "grad_norm": 1.143107533454895, + "learning_rate": 5.03448275862069e-06, + "loss": 0.3706, + "step": 2094 + }, + { + "epoch": 4.977698483496878, + "grad_norm": 1.1284552812576294, + "learning_rate": 5.0321046373365044e-06, + "loss": 0.2925, + "step": 2095 + }, + { + "epoch": 4.9800773119238775, + "grad_norm": 1.214637279510498, + "learning_rate": 5.029726516052319e-06, + "loss": 0.3714, + "step": 2096 + }, + { + "epoch": 4.982456140350877, + "grad_norm": 1.202973484992981, + "learning_rate": 5.027348394768134e-06, + "loss": 0.3225, + "step": 2097 + }, + { + "epoch": 4.984834968777877, + "grad_norm": 1.333562970161438, + "learning_rate": 5.024970273483947e-06, + "loss": 0.3288, + "step": 2098 + }, + { + "epoch": 4.987213797204877, + "grad_norm": 1.1040313243865967, + "learning_rate": 5.022592152199762e-06, + "loss": 0.3343, + "step": 2099 + }, + { + "epoch": 4.989592625631876, + "grad_norm": 1.1800580024719238, + "learning_rate": 5.020214030915577e-06, + "loss": 0.3759, + "step": 2100 + }, + { + "epoch": 4.989592625631876, + "eval_loss": 0.4248233437538147, + "eval_runtime": 23.0769, + "eval_samples_per_second": 32.413, + "eval_steps_per_second": 16.207, + "step": 2100 + }, + { + "epoch": 4.991971454058876, + "grad_norm": 1.1622729301452637, + "learning_rate": 5.017835909631391e-06, + "loss": 0.3821, + "step": 2101 + }, + { + "epoch": 4.994350282485875, + "grad_norm": 1.4399574995040894, + "learning_rate": 5.015457788347207e-06, + "loss": 0.4184, + "step": 2102 + }, + { + "epoch": 4.9967291109128755, + "grad_norm": 1.220841407775879, + "learning_rate": 5.013079667063021e-06, + "loss": 0.3813, + "step": 2103 + }, + { + "epoch": 4.999107939339875, + "grad_norm": 1.0531835556030273, + "learning_rate": 5.010701545778835e-06, + "loss": 0.3473, + "step": 2104 + }, + { + "epoch": 5.0, + "grad_norm": 2.1977779865264893, + "learning_rate": 5.00832342449465e-06, + "loss": 0.3023, + "step": 2105 + }, + { + "epoch": 5.002378828427, + "grad_norm": 1.166244387626648, + "learning_rate": 5.005945303210464e-06, + "loss": 0.3565, + "step": 2106 + }, + { + "epoch": 5.004757656853999, + "grad_norm": 1.3517615795135498, + "learning_rate": 5.003567181926279e-06, + "loss": 0.4255, + "step": 2107 + }, + { + "epoch": 5.007136485280999, + "grad_norm": 1.1704368591308594, + "learning_rate": 5.0011890606420935e-06, + "loss": 0.3024, + "step": 2108 + }, + { + "epoch": 5.009515313707999, + "grad_norm": 1.214595913887024, + "learning_rate": 4.998810939357907e-06, + "loss": 0.3898, + "step": 2109 + }, + { + "epoch": 5.011894142134999, + "grad_norm": 1.127975344657898, + "learning_rate": 4.996432818073722e-06, + "loss": 0.3646, + "step": 2110 + }, + { + "epoch": 5.014272970561998, + "grad_norm": 1.1404362916946411, + "learning_rate": 4.9940546967895365e-06, + "loss": 0.376, + "step": 2111 + }, + { + "epoch": 5.016651798988998, + "grad_norm": 1.1638420820236206, + "learning_rate": 4.991676575505351e-06, + "loss": 0.3502, + "step": 2112 + }, + { + "epoch": 5.019030627415997, + "grad_norm": 1.3349223136901855, + "learning_rate": 4.989298454221166e-06, + "loss": 0.3697, + "step": 2113 + }, + { + "epoch": 5.021409455842997, + "grad_norm": 1.091774344444275, + "learning_rate": 4.98692033293698e-06, + "loss": 0.3202, + "step": 2114 + }, + { + "epoch": 5.0237882842699975, + "grad_norm": 1.2727184295654297, + "learning_rate": 4.984542211652795e-06, + "loss": 0.4137, + "step": 2115 + }, + { + "epoch": 5.026167112696997, + "grad_norm": 1.1745508909225464, + "learning_rate": 4.9821640903686096e-06, + "loss": 0.3484, + "step": 2116 + }, + { + "epoch": 5.028545941123997, + "grad_norm": 1.25146484375, + "learning_rate": 4.979785969084423e-06, + "loss": 0.3532, + "step": 2117 + }, + { + "epoch": 5.030924769550996, + "grad_norm": 1.3461670875549316, + "learning_rate": 4.977407847800238e-06, + "loss": 0.364, + "step": 2118 + }, + { + "epoch": 5.033303597977996, + "grad_norm": 1.2453292608261108, + "learning_rate": 4.975029726516053e-06, + "loss": 0.362, + "step": 2119 + }, + { + "epoch": 5.035682426404995, + "grad_norm": 1.0891677141189575, + "learning_rate": 4.972651605231867e-06, + "loss": 0.4055, + "step": 2120 + }, + { + "epoch": 5.038061254831995, + "grad_norm": 1.4016095399856567, + "learning_rate": 4.970273483947682e-06, + "loss": 0.4178, + "step": 2121 + }, + { + "epoch": 5.040440083258995, + "grad_norm": 1.3246444463729858, + "learning_rate": 4.967895362663496e-06, + "loss": 0.4008, + "step": 2122 + }, + { + "epoch": 5.042818911685995, + "grad_norm": 1.2041239738464355, + "learning_rate": 4.965517241379311e-06, + "loss": 0.3943, + "step": 2123 + }, + { + "epoch": 5.045197740112994, + "grad_norm": 1.3242827653884888, + "learning_rate": 4.963139120095125e-06, + "loss": 0.4367, + "step": 2124 + }, + { + "epoch": 5.047576568539994, + "grad_norm": 1.1359694004058838, + "learning_rate": 4.960760998810939e-06, + "loss": 0.3712, + "step": 2125 + }, + { + "epoch": 5.049955396966994, + "grad_norm": 1.3565722703933716, + "learning_rate": 4.958382877526755e-06, + "loss": 0.4281, + "step": 2126 + }, + { + "epoch": 5.052334225393993, + "grad_norm": 1.3466897010803223, + "learning_rate": 4.956004756242569e-06, + "loss": 0.4267, + "step": 2127 + }, + { + "epoch": 5.0547130538209935, + "grad_norm": 1.1781353950500488, + "learning_rate": 4.953626634958383e-06, + "loss": 0.3167, + "step": 2128 + }, + { + "epoch": 5.057091882247993, + "grad_norm": 1.2434474229812622, + "learning_rate": 4.951248513674198e-06, + "loss": 0.3544, + "step": 2129 + }, + { + "epoch": 5.059470710674993, + "grad_norm": 1.1215646266937256, + "learning_rate": 4.9488703923900124e-06, + "loss": 0.3482, + "step": 2130 + }, + { + "epoch": 5.061849539101992, + "grad_norm": 1.3371787071228027, + "learning_rate": 4.946492271105827e-06, + "loss": 0.3959, + "step": 2131 + }, + { + "epoch": 5.064228367528992, + "grad_norm": 1.3682892322540283, + "learning_rate": 4.944114149821641e-06, + "loss": 0.3789, + "step": 2132 + }, + { + "epoch": 5.066607195955991, + "grad_norm": 1.1645300388336182, + "learning_rate": 4.941736028537456e-06, + "loss": 0.3798, + "step": 2133 + }, + { + "epoch": 5.068986024382991, + "grad_norm": 1.0713883638381958, + "learning_rate": 4.939357907253271e-06, + "loss": 0.3868, + "step": 2134 + }, + { + "epoch": 5.0713648528099915, + "grad_norm": 1.028072714805603, + "learning_rate": 4.936979785969085e-06, + "loss": 0.3334, + "step": 2135 + }, + { + "epoch": 5.073743681236991, + "grad_norm": 1.073922038078308, + "learning_rate": 4.934601664684899e-06, + "loss": 0.3707, + "step": 2136 + }, + { + "epoch": 5.076122509663991, + "grad_norm": 1.2322797775268555, + "learning_rate": 4.932223543400714e-06, + "loss": 0.3801, + "step": 2137 + }, + { + "epoch": 5.07850133809099, + "grad_norm": 1.3131983280181885, + "learning_rate": 4.9298454221165285e-06, + "loss": 0.3582, + "step": 2138 + }, + { + "epoch": 5.08088016651799, + "grad_norm": 1.0657602548599243, + "learning_rate": 4.927467300832342e-06, + "loss": 0.3403, + "step": 2139 + }, + { + "epoch": 5.083258994944989, + "grad_norm": 1.1790955066680908, + "learning_rate": 4.925089179548158e-06, + "loss": 0.3468, + "step": 2140 + }, + { + "epoch": 5.085637823371989, + "grad_norm": 1.0671786069869995, + "learning_rate": 4.922711058263972e-06, + "loss": 0.3108, + "step": 2141 + }, + { + "epoch": 5.088016651798989, + "grad_norm": 1.0860204696655273, + "learning_rate": 4.920332936979786e-06, + "loss": 0.3582, + "step": 2142 + }, + { + "epoch": 5.090395480225989, + "grad_norm": 1.2334431409835815, + "learning_rate": 4.917954815695601e-06, + "loss": 0.3466, + "step": 2143 + }, + { + "epoch": 5.092774308652988, + "grad_norm": 1.13796865940094, + "learning_rate": 4.915576694411415e-06, + "loss": 0.3241, + "step": 2144 + }, + { + "epoch": 5.095153137079988, + "grad_norm": 1.0901583433151245, + "learning_rate": 4.91319857312723e-06, + "loss": 0.3745, + "step": 2145 + }, + { + "epoch": 5.097531965506988, + "grad_norm": 1.2266532182693481, + "learning_rate": 4.9108204518430445e-06, + "loss": 0.3733, + "step": 2146 + }, + { + "epoch": 5.099910793933987, + "grad_norm": 1.1073282957077026, + "learning_rate": 4.908442330558859e-06, + "loss": 0.3359, + "step": 2147 + }, + { + "epoch": 5.1022896223609875, + "grad_norm": 1.3475801944732666, + "learning_rate": 4.906064209274674e-06, + "loss": 0.3762, + "step": 2148 + }, + { + "epoch": 5.104668450787987, + "grad_norm": 1.4092116355895996, + "learning_rate": 4.903686087990488e-06, + "loss": 0.4263, + "step": 2149 + }, + { + "epoch": 5.107047279214987, + "grad_norm": 1.1807891130447388, + "learning_rate": 4.901307966706302e-06, + "loss": 0.3622, + "step": 2150 + }, + { + "epoch": 5.107047279214987, + "eval_loss": 0.4271685779094696, + "eval_runtime": 23.1906, + "eval_samples_per_second": 32.254, + "eval_steps_per_second": 16.127, + "step": 2150 + }, + { + "epoch": 5.109426107641986, + "grad_norm": 1.2000404596328735, + "learning_rate": 4.898929845422117e-06, + "loss": 0.325, + "step": 2151 + }, + { + "epoch": 5.111804936068986, + "grad_norm": 1.265765905380249, + "learning_rate": 4.896551724137931e-06, + "loss": 0.3684, + "step": 2152 + }, + { + "epoch": 5.114183764495985, + "grad_norm": 1.3624063730239868, + "learning_rate": 4.894173602853746e-06, + "loss": 0.4131, + "step": 2153 + }, + { + "epoch": 5.116562592922985, + "grad_norm": 1.2329858541488647, + "learning_rate": 4.8917954815695605e-06, + "loss": 0.3026, + "step": 2154 + }, + { + "epoch": 5.1189414213499855, + "grad_norm": 1.497024655342102, + "learning_rate": 4.889417360285375e-06, + "loss": 0.4137, + "step": 2155 + }, + { + "epoch": 5.121320249776985, + "grad_norm": 1.1989549398422241, + "learning_rate": 4.88703923900119e-06, + "loss": 0.4956, + "step": 2156 + }, + { + "epoch": 5.123699078203985, + "grad_norm": 1.1640437841415405, + "learning_rate": 4.8846611177170035e-06, + "loss": 0.3567, + "step": 2157 + }, + { + "epoch": 5.126077906630984, + "grad_norm": 1.0846333503723145, + "learning_rate": 4.882282996432818e-06, + "loss": 0.2972, + "step": 2158 + }, + { + "epoch": 5.128456735057984, + "grad_norm": 1.3464322090148926, + "learning_rate": 4.879904875148633e-06, + "loss": 0.4069, + "step": 2159 + }, + { + "epoch": 5.130835563484983, + "grad_norm": 1.249952793121338, + "learning_rate": 4.877526753864447e-06, + "loss": 0.3737, + "step": 2160 + }, + { + "epoch": 5.1332143919119835, + "grad_norm": 1.206658124923706, + "learning_rate": 4.875148632580262e-06, + "loss": 0.3181, + "step": 2161 + }, + { + "epoch": 5.135593220338983, + "grad_norm": 1.3309097290039062, + "learning_rate": 4.872770511296077e-06, + "loss": 0.3794, + "step": 2162 + }, + { + "epoch": 5.137972048765983, + "grad_norm": 1.1474791765213013, + "learning_rate": 4.870392390011891e-06, + "loss": 0.3299, + "step": 2163 + }, + { + "epoch": 5.140350877192983, + "grad_norm": 1.1475800275802612, + "learning_rate": 4.868014268727706e-06, + "loss": 0.2878, + "step": 2164 + }, + { + "epoch": 5.142729705619982, + "grad_norm": 1.2458198070526123, + "learning_rate": 4.8656361474435196e-06, + "loss": 0.393, + "step": 2165 + }, + { + "epoch": 5.145108534046982, + "grad_norm": 1.0997475385665894, + "learning_rate": 4.863258026159334e-06, + "loss": 0.2976, + "step": 2166 + }, + { + "epoch": 5.147487362473981, + "grad_norm": 1.3968905210494995, + "learning_rate": 4.86087990487515e-06, + "loss": 0.3844, + "step": 2167 + }, + { + "epoch": 5.1498661909009815, + "grad_norm": 1.1975730657577515, + "learning_rate": 4.858501783590963e-06, + "loss": 0.3366, + "step": 2168 + }, + { + "epoch": 5.152245019327981, + "grad_norm": 1.1442532539367676, + "learning_rate": 4.856123662306778e-06, + "loss": 0.3227, + "step": 2169 + }, + { + "epoch": 5.154623847754981, + "grad_norm": 1.4302014112472534, + "learning_rate": 4.853745541022593e-06, + "loss": 0.3836, + "step": 2170 + }, + { + "epoch": 5.15700267618198, + "grad_norm": 1.269264578819275, + "learning_rate": 4.851367419738407e-06, + "loss": 0.3957, + "step": 2171 + }, + { + "epoch": 5.15938150460898, + "grad_norm": 1.3459832668304443, + "learning_rate": 4.848989298454221e-06, + "loss": 0.4345, + "step": 2172 + }, + { + "epoch": 5.161760333035979, + "grad_norm": 1.229471206665039, + "learning_rate": 4.846611177170036e-06, + "loss": 0.3591, + "step": 2173 + }, + { + "epoch": 5.1641391614629795, + "grad_norm": 1.2357522249221802, + "learning_rate": 4.844233055885851e-06, + "loss": 0.3387, + "step": 2174 + }, + { + "epoch": 5.16651798988998, + "grad_norm": 1.0728986263275146, + "learning_rate": 4.841854934601665e-06, + "loss": 0.3092, + "step": 2175 + }, + { + "epoch": 5.168896818316979, + "grad_norm": 1.1082243919372559, + "learning_rate": 4.8394768133174794e-06, + "loss": 0.3209, + "step": 2176 + }, + { + "epoch": 5.171275646743979, + "grad_norm": 1.2395209074020386, + "learning_rate": 4.837098692033294e-06, + "loss": 0.3693, + "step": 2177 + }, + { + "epoch": 5.173654475170978, + "grad_norm": 1.145666241645813, + "learning_rate": 4.834720570749109e-06, + "loss": 0.3513, + "step": 2178 + }, + { + "epoch": 5.176033303597978, + "grad_norm": 1.0506113767623901, + "learning_rate": 4.832342449464923e-06, + "loss": 0.4125, + "step": 2179 + }, + { + "epoch": 5.178412132024977, + "grad_norm": 1.2678948640823364, + "learning_rate": 4.829964328180737e-06, + "loss": 0.3267, + "step": 2180 + }, + { + "epoch": 5.1807909604519775, + "grad_norm": 1.1697998046875, + "learning_rate": 4.8275862068965525e-06, + "loss": 0.3511, + "step": 2181 + }, + { + "epoch": 5.183169788878977, + "grad_norm": 1.1346150636672974, + "learning_rate": 4.825208085612367e-06, + "loss": 0.3201, + "step": 2182 + }, + { + "epoch": 5.185548617305977, + "grad_norm": 1.278774380683899, + "learning_rate": 4.822829964328181e-06, + "loss": 0.3498, + "step": 2183 + }, + { + "epoch": 5.187927445732977, + "grad_norm": 1.2950605154037476, + "learning_rate": 4.8204518430439955e-06, + "loss": 0.3321, + "step": 2184 + }, + { + "epoch": 5.190306274159976, + "grad_norm": 1.1749104261398315, + "learning_rate": 4.81807372175981e-06, + "loss": 0.3555, + "step": 2185 + }, + { + "epoch": 5.192685102586976, + "grad_norm": 1.2565380334854126, + "learning_rate": 4.815695600475625e-06, + "loss": 0.3421, + "step": 2186 + }, + { + "epoch": 5.1950639310139755, + "grad_norm": 1.3786242008209229, + "learning_rate": 4.8133174791914385e-06, + "loss": 0.3946, + "step": 2187 + }, + { + "epoch": 5.197442759440976, + "grad_norm": 1.238984227180481, + "learning_rate": 4.810939357907254e-06, + "loss": 0.3903, + "step": 2188 + }, + { + "epoch": 5.199821587867975, + "grad_norm": 1.165662407875061, + "learning_rate": 4.8085612366230685e-06, + "loss": 0.3205, + "step": 2189 + }, + { + "epoch": 5.202200416294975, + "grad_norm": 1.1436853408813477, + "learning_rate": 4.806183115338882e-06, + "loss": 0.3158, + "step": 2190 + }, + { + "epoch": 5.204579244721974, + "grad_norm": 1.1758873462677002, + "learning_rate": 4.803804994054697e-06, + "loss": 0.2595, + "step": 2191 + }, + { + "epoch": 5.206958073148974, + "grad_norm": 1.2953070402145386, + "learning_rate": 4.8014268727705115e-06, + "loss": 0.4112, + "step": 2192 + }, + { + "epoch": 5.209336901575973, + "grad_norm": 1.1498371362686157, + "learning_rate": 4.799048751486326e-06, + "loss": 0.3116, + "step": 2193 + }, + { + "epoch": 5.2117157300029735, + "grad_norm": 1.2843515872955322, + "learning_rate": 4.796670630202141e-06, + "loss": 0.4188, + "step": 2194 + }, + { + "epoch": 5.214094558429974, + "grad_norm": 1.3652373552322388, + "learning_rate": 4.794292508917955e-06, + "loss": 0.359, + "step": 2195 + }, + { + "epoch": 5.216473386856973, + "grad_norm": 1.260819673538208, + "learning_rate": 4.79191438763377e-06, + "loss": 0.3286, + "step": 2196 + }, + { + "epoch": 5.218852215283973, + "grad_norm": 1.1289883852005005, + "learning_rate": 4.789536266349585e-06, + "loss": 0.3078, + "step": 2197 + }, + { + "epoch": 5.221231043710972, + "grad_norm": 1.2726932764053345, + "learning_rate": 4.787158145065398e-06, + "loss": 0.3772, + "step": 2198 + }, + { + "epoch": 5.223609872137972, + "grad_norm": 1.1747164726257324, + "learning_rate": 4.784780023781213e-06, + "loss": 0.3628, + "step": 2199 + }, + { + "epoch": 5.2259887005649714, + "grad_norm": 1.2917836904525757, + "learning_rate": 4.7824019024970276e-06, + "loss": 0.3675, + "step": 2200 + }, + { + "epoch": 5.2259887005649714, + "eval_loss": 0.4263933598995209, + "eval_runtime": 23.0998, + "eval_samples_per_second": 32.381, + "eval_steps_per_second": 16.191, + "step": 2200 + }, + { + "epoch": 5.2283675289919715, + "grad_norm": 1.25987708568573, + "learning_rate": 4.780023781212842e-06, + "loss": 0.3765, + "step": 2201 + }, + { + "epoch": 5.230746357418971, + "grad_norm": 1.3081939220428467, + "learning_rate": 4.777645659928657e-06, + "loss": 0.3041, + "step": 2202 + }, + { + "epoch": 5.233125185845971, + "grad_norm": 1.3990867137908936, + "learning_rate": 4.775267538644471e-06, + "loss": 0.3255, + "step": 2203 + }, + { + "epoch": 5.235504014272971, + "grad_norm": 1.243141531944275, + "learning_rate": 4.772889417360286e-06, + "loss": 0.2974, + "step": 2204 + }, + { + "epoch": 5.23788284269997, + "grad_norm": 1.3374601602554321, + "learning_rate": 4.770511296076101e-06, + "loss": 0.3435, + "step": 2205 + }, + { + "epoch": 5.24026167112697, + "grad_norm": 1.2881165742874146, + "learning_rate": 4.768133174791914e-06, + "loss": 0.3586, + "step": 2206 + }, + { + "epoch": 5.2426404995539695, + "grad_norm": 1.2549759149551392, + "learning_rate": 4.765755053507729e-06, + "loss": 0.3457, + "step": 2207 + }, + { + "epoch": 5.24501932798097, + "grad_norm": 1.2617493867874146, + "learning_rate": 4.763376932223544e-06, + "loss": 0.3499, + "step": 2208 + }, + { + "epoch": 5.247398156407969, + "grad_norm": 1.1265956163406372, + "learning_rate": 4.760998810939358e-06, + "loss": 0.3347, + "step": 2209 + }, + { + "epoch": 5.249776984834969, + "grad_norm": 1.3432070016860962, + "learning_rate": 4.758620689655173e-06, + "loss": 0.3234, + "step": 2210 + }, + { + "epoch": 5.252155813261968, + "grad_norm": 1.1490808725357056, + "learning_rate": 4.7562425683709874e-06, + "loss": 0.3915, + "step": 2211 + }, + { + "epoch": 5.254534641688968, + "grad_norm": 1.3073781728744507, + "learning_rate": 4.753864447086802e-06, + "loss": 0.412, + "step": 2212 + }, + { + "epoch": 5.256913470115968, + "grad_norm": 1.4428491592407227, + "learning_rate": 4.751486325802616e-06, + "loss": 0.4226, + "step": 2213 + }, + { + "epoch": 5.2592922985429675, + "grad_norm": 1.4250330924987793, + "learning_rate": 4.74910820451843e-06, + "loss": 0.3676, + "step": 2214 + }, + { + "epoch": 5.261671126969968, + "grad_norm": 1.3372389078140259, + "learning_rate": 4.746730083234246e-06, + "loss": 0.3614, + "step": 2215 + }, + { + "epoch": 5.264049955396967, + "grad_norm": 1.1524739265441895, + "learning_rate": 4.74435196195006e-06, + "loss": 0.3024, + "step": 2216 + }, + { + "epoch": 5.266428783823967, + "grad_norm": 1.1237945556640625, + "learning_rate": 4.741973840665874e-06, + "loss": 0.349, + "step": 2217 + }, + { + "epoch": 5.268807612250966, + "grad_norm": 1.2900390625, + "learning_rate": 4.739595719381689e-06, + "loss": 0.3601, + "step": 2218 + }, + { + "epoch": 5.271186440677966, + "grad_norm": 1.2037912607192993, + "learning_rate": 4.7372175980975035e-06, + "loss": 0.3143, + "step": 2219 + }, + { + "epoch": 5.2735652691049655, + "grad_norm": 1.20513117313385, + "learning_rate": 4.734839476813318e-06, + "loss": 0.3718, + "step": 2220 + }, + { + "epoch": 5.275944097531966, + "grad_norm": 1.423080325126648, + "learning_rate": 4.732461355529132e-06, + "loss": 0.3427, + "step": 2221 + }, + { + "epoch": 5.278322925958965, + "grad_norm": 1.2976034879684448, + "learning_rate": 4.730083234244947e-06, + "loss": 0.3568, + "step": 2222 + }, + { + "epoch": 5.280701754385965, + "grad_norm": 1.3433586359024048, + "learning_rate": 4.727705112960762e-06, + "loss": 0.3289, + "step": 2223 + }, + { + "epoch": 5.283080582812965, + "grad_norm": 1.0534664392471313, + "learning_rate": 4.725326991676576e-06, + "loss": 0.2748, + "step": 2224 + }, + { + "epoch": 5.285459411239964, + "grad_norm": 1.50064218044281, + "learning_rate": 4.72294887039239e-06, + "loss": 0.3747, + "step": 2225 + }, + { + "epoch": 5.287838239666964, + "grad_norm": 1.4055958986282349, + "learning_rate": 4.720570749108205e-06, + "loss": 0.431, + "step": 2226 + }, + { + "epoch": 5.2902170680939635, + "grad_norm": 1.066025972366333, + "learning_rate": 4.7181926278240195e-06, + "loss": 0.3579, + "step": 2227 + }, + { + "epoch": 5.292595896520964, + "grad_norm": 1.2729994058609009, + "learning_rate": 4.715814506539834e-06, + "loss": 0.3882, + "step": 2228 + }, + { + "epoch": 5.294974724947963, + "grad_norm": 1.3019053936004639, + "learning_rate": 4.713436385255649e-06, + "loss": 0.3377, + "step": 2229 + }, + { + "epoch": 5.297353553374963, + "grad_norm": 1.2607390880584717, + "learning_rate": 4.711058263971463e-06, + "loss": 0.3207, + "step": 2230 + }, + { + "epoch": 5.299732381801962, + "grad_norm": 1.1079074144363403, + "learning_rate": 4.708680142687277e-06, + "loss": 0.348, + "step": 2231 + }, + { + "epoch": 5.302111210228962, + "grad_norm": 1.3000109195709229, + "learning_rate": 4.706302021403092e-06, + "loss": 0.4255, + "step": 2232 + }, + { + "epoch": 5.3044900386559615, + "grad_norm": 1.152220606803894, + "learning_rate": 4.703923900118906e-06, + "loss": 0.3173, + "step": 2233 + }, + { + "epoch": 5.306868867082962, + "grad_norm": 1.3511146306991577, + "learning_rate": 4.701545778834721e-06, + "loss": 0.3479, + "step": 2234 + }, + { + "epoch": 5.309247695509962, + "grad_norm": 1.3266832828521729, + "learning_rate": 4.6991676575505356e-06, + "loss": 0.3153, + "step": 2235 + }, + { + "epoch": 5.311626523936961, + "grad_norm": 1.1276640892028809, + "learning_rate": 4.69678953626635e-06, + "loss": 0.3122, + "step": 2236 + }, + { + "epoch": 5.314005352363961, + "grad_norm": 1.3814538717269897, + "learning_rate": 4.694411414982165e-06, + "loss": 0.3348, + "step": 2237 + }, + { + "epoch": 5.31638418079096, + "grad_norm": 1.2050621509552002, + "learning_rate": 4.692033293697979e-06, + "loss": 0.3372, + "step": 2238 + }, + { + "epoch": 5.31876300921796, + "grad_norm": 1.6038686037063599, + "learning_rate": 4.689655172413793e-06, + "loss": 0.4251, + "step": 2239 + }, + { + "epoch": 5.3211418376449595, + "grad_norm": 1.313639521598816, + "learning_rate": 4.687277051129608e-06, + "loss": 0.3261, + "step": 2240 + }, + { + "epoch": 5.32352066607196, + "grad_norm": 1.396295428276062, + "learning_rate": 4.684898929845422e-06, + "loss": 0.4065, + "step": 2241 + }, + { + "epoch": 5.325899494498959, + "grad_norm": 1.3664578199386597, + "learning_rate": 4.682520808561237e-06, + "loss": 0.3745, + "step": 2242 + }, + { + "epoch": 5.328278322925959, + "grad_norm": 1.3295447826385498, + "learning_rate": 4.680142687277052e-06, + "loss": 0.3632, + "step": 2243 + }, + { + "epoch": 5.330657151352959, + "grad_norm": 1.1266640424728394, + "learning_rate": 4.677764565992866e-06, + "loss": 0.3305, + "step": 2244 + }, + { + "epoch": 5.333035979779958, + "grad_norm": 1.1757619380950928, + "learning_rate": 4.675386444708681e-06, + "loss": 0.3855, + "step": 2245 + }, + { + "epoch": 5.335414808206958, + "grad_norm": 1.3831630945205688, + "learning_rate": 4.673008323424495e-06, + "loss": 0.4033, + "step": 2246 + }, + { + "epoch": 5.337793636633958, + "grad_norm": 1.148657202720642, + "learning_rate": 4.670630202140309e-06, + "loss": 0.3051, + "step": 2247 + }, + { + "epoch": 5.340172465060958, + "grad_norm": 1.2635345458984375, + "learning_rate": 4.668252080856124e-06, + "loss": 0.3405, + "step": 2248 + }, + { + "epoch": 5.342551293487957, + "grad_norm": 1.2051678895950317, + "learning_rate": 4.665873959571938e-06, + "loss": 0.3458, + "step": 2249 + }, + { + "epoch": 5.344930121914957, + "grad_norm": 1.3177422285079956, + "learning_rate": 4.663495838287753e-06, + "loss": 0.3937, + "step": 2250 + }, + { + "epoch": 5.344930121914957, + "eval_loss": 0.42643940448760986, + "eval_runtime": 23.1393, + "eval_samples_per_second": 32.326, + "eval_steps_per_second": 16.163, + "step": 2250 + }, + { + "epoch": 5.347308950341956, + "grad_norm": 1.190124273300171, + "learning_rate": 4.661117717003568e-06, + "loss": 0.3611, + "step": 2251 + }, + { + "epoch": 5.349687778768956, + "grad_norm": 1.4143081903457642, + "learning_rate": 4.658739595719382e-06, + "loss": 0.4022, + "step": 2252 + }, + { + "epoch": 5.352066607195956, + "grad_norm": 1.1534010171890259, + "learning_rate": 4.656361474435197e-06, + "loss": 0.3406, + "step": 2253 + }, + { + "epoch": 5.354445435622956, + "grad_norm": 1.2996052503585815, + "learning_rate": 4.653983353151011e-06, + "loss": 0.3496, + "step": 2254 + }, + { + "epoch": 5.356824264049956, + "grad_norm": 1.630771517753601, + "learning_rate": 4.651605231866825e-06, + "loss": 0.4272, + "step": 2255 + }, + { + "epoch": 5.359203092476955, + "grad_norm": 1.213273525238037, + "learning_rate": 4.649227110582641e-06, + "loss": 0.3167, + "step": 2256 + }, + { + "epoch": 5.361581920903955, + "grad_norm": 1.2760071754455566, + "learning_rate": 4.6468489892984545e-06, + "loss": 0.3432, + "step": 2257 + }, + { + "epoch": 5.363960749330954, + "grad_norm": 1.2334431409835815, + "learning_rate": 4.644470868014269e-06, + "loss": 0.3007, + "step": 2258 + }, + { + "epoch": 5.366339577757954, + "grad_norm": 1.283540964126587, + "learning_rate": 4.642092746730084e-06, + "loss": 0.3274, + "step": 2259 + }, + { + "epoch": 5.3687184061849536, + "grad_norm": 1.40235435962677, + "learning_rate": 4.639714625445898e-06, + "loss": 0.3693, + "step": 2260 + }, + { + "epoch": 5.371097234611954, + "grad_norm": 1.3751204013824463, + "learning_rate": 4.637336504161712e-06, + "loss": 0.4083, + "step": 2261 + }, + { + "epoch": 5.373476063038954, + "grad_norm": 1.1771738529205322, + "learning_rate": 4.634958382877527e-06, + "loss": 0.349, + "step": 2262 + }, + { + "epoch": 5.375854891465953, + "grad_norm": 1.3026350736618042, + "learning_rate": 4.632580261593342e-06, + "loss": 0.3149, + "step": 2263 + }, + { + "epoch": 5.378233719892953, + "grad_norm": 1.1984606981277466, + "learning_rate": 4.630202140309156e-06, + "loss": 0.3753, + "step": 2264 + }, + { + "epoch": 5.380612548319952, + "grad_norm": 1.2461832761764526, + "learning_rate": 4.6278240190249705e-06, + "loss": 0.3749, + "step": 2265 + }, + { + "epoch": 5.382991376746952, + "grad_norm": 1.3668829202651978, + "learning_rate": 4.625445897740785e-06, + "loss": 0.3893, + "step": 2266 + }, + { + "epoch": 5.385370205173952, + "grad_norm": 1.3800373077392578, + "learning_rate": 4.6230677764566e-06, + "loss": 0.4348, + "step": 2267 + }, + { + "epoch": 5.387749033600952, + "grad_norm": 1.349463701248169, + "learning_rate": 4.620689655172414e-06, + "loss": 0.4035, + "step": 2268 + }, + { + "epoch": 5.390127862027951, + "grad_norm": 1.4032188653945923, + "learning_rate": 4.618311533888229e-06, + "loss": 0.3954, + "step": 2269 + }, + { + "epoch": 5.392506690454951, + "grad_norm": 1.1075043678283691, + "learning_rate": 4.6159334126040436e-06, + "loss": 0.3766, + "step": 2270 + }, + { + "epoch": 5.39488551888195, + "grad_norm": 1.3073793649673462, + "learning_rate": 4.613555291319858e-06, + "loss": 0.4267, + "step": 2271 + }, + { + "epoch": 5.39726434730895, + "grad_norm": 1.1431479454040527, + "learning_rate": 4.611177170035672e-06, + "loss": 0.3512, + "step": 2272 + }, + { + "epoch": 5.39964317573595, + "grad_norm": 1.4438538551330566, + "learning_rate": 4.6087990487514865e-06, + "loss": 0.3441, + "step": 2273 + }, + { + "epoch": 5.40202200416295, + "grad_norm": 1.1673023700714111, + "learning_rate": 4.606420927467301e-06, + "loss": 0.3472, + "step": 2274 + }, + { + "epoch": 5.40440083258995, + "grad_norm": 1.2464430332183838, + "learning_rate": 4.604042806183116e-06, + "loss": 0.3452, + "step": 2275 + }, + { + "epoch": 5.406779661016949, + "grad_norm": 1.345447063446045, + "learning_rate": 4.60166468489893e-06, + "loss": 0.3439, + "step": 2276 + }, + { + "epoch": 5.409158489443949, + "grad_norm": 1.2835646867752075, + "learning_rate": 4.599286563614745e-06, + "loss": 0.3146, + "step": 2277 + }, + { + "epoch": 5.411537317870948, + "grad_norm": 1.2727875709533691, + "learning_rate": 4.59690844233056e-06, + "loss": 0.3934, + "step": 2278 + }, + { + "epoch": 5.413916146297948, + "grad_norm": 1.5119589567184448, + "learning_rate": 4.594530321046373e-06, + "loss": 0.4441, + "step": 2279 + }, + { + "epoch": 5.416294974724948, + "grad_norm": 1.453375220298767, + "learning_rate": 4.592152199762188e-06, + "loss": 0.3949, + "step": 2280 + }, + { + "epoch": 5.418673803151948, + "grad_norm": 1.1675848960876465, + "learning_rate": 4.5897740784780026e-06, + "loss": 0.3805, + "step": 2281 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 1.296386957168579, + "learning_rate": 4.587395957193817e-06, + "loss": 0.3376, + "step": 2282 + }, + { + "epoch": 5.423431460005947, + "grad_norm": 1.1445293426513672, + "learning_rate": 4.585017835909632e-06, + "loss": 0.3338, + "step": 2283 + }, + { + "epoch": 5.425810288432947, + "grad_norm": 1.2588094472885132, + "learning_rate": 4.582639714625446e-06, + "loss": 0.3706, + "step": 2284 + }, + { + "epoch": 5.428189116859946, + "grad_norm": 1.309922695159912, + "learning_rate": 4.580261593341261e-06, + "loss": 0.3356, + "step": 2285 + }, + { + "epoch": 5.430567945286946, + "grad_norm": 1.3413279056549072, + "learning_rate": 4.577883472057076e-06, + "loss": 0.3203, + "step": 2286 + }, + { + "epoch": 5.432946773713946, + "grad_norm": 1.1806575059890747, + "learning_rate": 4.575505350772889e-06, + "loss": 0.3679, + "step": 2287 + }, + { + "epoch": 5.435325602140946, + "grad_norm": 1.1468663215637207, + "learning_rate": 4.573127229488704e-06, + "loss": 0.3403, + "step": 2288 + }, + { + "epoch": 5.437704430567945, + "grad_norm": 1.177342414855957, + "learning_rate": 4.570749108204519e-06, + "loss": 0.3065, + "step": 2289 + }, + { + "epoch": 5.440083258994945, + "grad_norm": 1.4605060815811157, + "learning_rate": 4.568370986920333e-06, + "loss": 0.381, + "step": 2290 + }, + { + "epoch": 5.442462087421944, + "grad_norm": 1.3808484077453613, + "learning_rate": 4.565992865636148e-06, + "loss": 0.3634, + "step": 2291 + }, + { + "epoch": 5.444840915848944, + "grad_norm": 1.2701420783996582, + "learning_rate": 4.5636147443519624e-06, + "loss": 0.3087, + "step": 2292 + }, + { + "epoch": 5.4472197442759445, + "grad_norm": 1.2494807243347168, + "learning_rate": 4.561236623067777e-06, + "loss": 0.3331, + "step": 2293 + }, + { + "epoch": 5.449598572702944, + "grad_norm": 1.1639362573623657, + "learning_rate": 4.558858501783591e-06, + "loss": 0.3368, + "step": 2294 + }, + { + "epoch": 5.451977401129944, + "grad_norm": 1.3209528923034668, + "learning_rate": 4.5564803804994054e-06, + "loss": 0.3719, + "step": 2295 + }, + { + "epoch": 5.454356229556943, + "grad_norm": 1.4628571271896362, + "learning_rate": 4.55410225921522e-06, + "loss": 0.4303, + "step": 2296 + }, + { + "epoch": 5.456735057983943, + "grad_norm": 1.196649432182312, + "learning_rate": 4.551724137931035e-06, + "loss": 0.3473, + "step": 2297 + }, + { + "epoch": 5.459113886410942, + "grad_norm": 1.4699997901916504, + "learning_rate": 4.549346016646849e-06, + "loss": 0.3897, + "step": 2298 + }, + { + "epoch": 5.461492714837942, + "grad_norm": 1.19797682762146, + "learning_rate": 4.546967895362664e-06, + "loss": 0.3652, + "step": 2299 + }, + { + "epoch": 5.463871543264942, + "grad_norm": 1.2875802516937256, + "learning_rate": 4.5445897740784785e-06, + "loss": 0.3484, + "step": 2300 + }, + { + "epoch": 5.463871543264942, + "eval_loss": 0.4249710738658905, + "eval_runtime": 23.1024, + "eval_samples_per_second": 32.378, + "eval_steps_per_second": 16.189, + "step": 2300 + }, + { + "epoch": 5.466250371691942, + "grad_norm": 1.2586512565612793, + "learning_rate": 4.542211652794293e-06, + "loss": 0.3853, + "step": 2301 + }, + { + "epoch": 5.468629200118942, + "grad_norm": 1.2908281087875366, + "learning_rate": 4.539833531510107e-06, + "loss": 0.3087, + "step": 2302 + }, + { + "epoch": 5.471008028545941, + "grad_norm": 1.1374825239181519, + "learning_rate": 4.5374554102259215e-06, + "loss": 0.3135, + "step": 2303 + }, + { + "epoch": 5.473386856972941, + "grad_norm": 1.1026619672775269, + "learning_rate": 4.535077288941737e-06, + "loss": 0.313, + "step": 2304 + }, + { + "epoch": 5.47576568539994, + "grad_norm": 1.4172130823135376, + "learning_rate": 4.532699167657551e-06, + "loss": 0.3299, + "step": 2305 + }, + { + "epoch": 5.4781445138269405, + "grad_norm": 1.1859735250473022, + "learning_rate": 4.530321046373365e-06, + "loss": 0.3511, + "step": 2306 + }, + { + "epoch": 5.48052334225394, + "grad_norm": 1.0702934265136719, + "learning_rate": 4.52794292508918e-06, + "loss": 0.3304, + "step": 2307 + }, + { + "epoch": 5.48290217068094, + "grad_norm": 1.3154021501541138, + "learning_rate": 4.5255648038049945e-06, + "loss": 0.3181, + "step": 2308 + }, + { + "epoch": 5.485280999107939, + "grad_norm": 1.3425809144973755, + "learning_rate": 4.523186682520809e-06, + "loss": 0.3167, + "step": 2309 + }, + { + "epoch": 5.487659827534939, + "grad_norm": 1.1865429878234863, + "learning_rate": 4.520808561236624e-06, + "loss": 0.3019, + "step": 2310 + }, + { + "epoch": 5.490038655961939, + "grad_norm": 1.1835495233535767, + "learning_rate": 4.518430439952438e-06, + "loss": 0.3076, + "step": 2311 + }, + { + "epoch": 5.492417484388938, + "grad_norm": 1.1418440341949463, + "learning_rate": 4.516052318668253e-06, + "loss": 0.3495, + "step": 2312 + }, + { + "epoch": 5.4947963128159385, + "grad_norm": 1.3444372415542603, + "learning_rate": 4.513674197384067e-06, + "loss": 0.3204, + "step": 2313 + }, + { + "epoch": 5.497175141242938, + "grad_norm": 1.1766853332519531, + "learning_rate": 4.511296076099881e-06, + "loss": 0.3615, + "step": 2314 + }, + { + "epoch": 5.499553969669938, + "grad_norm": 1.216193437576294, + "learning_rate": 4.508917954815696e-06, + "loss": 0.3333, + "step": 2315 + }, + { + "epoch": 5.501932798096937, + "grad_norm": 1.371532917022705, + "learning_rate": 4.5065398335315106e-06, + "loss": 0.4182, + "step": 2316 + }, + { + "epoch": 5.504311626523937, + "grad_norm": 1.2848918437957764, + "learning_rate": 4.504161712247325e-06, + "loss": 0.3219, + "step": 2317 + }, + { + "epoch": 5.506690454950936, + "grad_norm": 1.552178144454956, + "learning_rate": 4.50178359096314e-06, + "loss": 0.4007, + "step": 2318 + }, + { + "epoch": 5.509069283377936, + "grad_norm": 1.3235853910446167, + "learning_rate": 4.499405469678954e-06, + "loss": 0.3884, + "step": 2319 + }, + { + "epoch": 5.5114481118049365, + "grad_norm": 1.2004040479660034, + "learning_rate": 4.497027348394768e-06, + "loss": 0.3587, + "step": 2320 + }, + { + "epoch": 5.513826940231936, + "grad_norm": 1.2903748750686646, + "learning_rate": 4.494649227110583e-06, + "loss": 0.3074, + "step": 2321 + }, + { + "epoch": 5.516205768658936, + "grad_norm": 1.2189632654190063, + "learning_rate": 4.492271105826397e-06, + "loss": 0.307, + "step": 2322 + }, + { + "epoch": 5.518584597085935, + "grad_norm": 1.4042412042617798, + "learning_rate": 4.489892984542212e-06, + "loss": 0.4339, + "step": 2323 + }, + { + "epoch": 5.520963425512935, + "grad_norm": 1.4478718042373657, + "learning_rate": 4.487514863258027e-06, + "loss": 0.3984, + "step": 2324 + }, + { + "epoch": 5.523342253939934, + "grad_norm": 1.3115742206573486, + "learning_rate": 4.485136741973841e-06, + "loss": 0.3242, + "step": 2325 + }, + { + "epoch": 5.5257210823669345, + "grad_norm": 1.19297456741333, + "learning_rate": 4.482758620689656e-06, + "loss": 0.3608, + "step": 2326 + }, + { + "epoch": 5.528099910793934, + "grad_norm": 1.3011291027069092, + "learning_rate": 4.4803804994054704e-06, + "loss": 0.3594, + "step": 2327 + }, + { + "epoch": 5.530478739220934, + "grad_norm": 1.1068850755691528, + "learning_rate": 4.478002378121284e-06, + "loss": 0.3012, + "step": 2328 + }, + { + "epoch": 5.532857567647933, + "grad_norm": 1.3074021339416504, + "learning_rate": 4.475624256837099e-06, + "loss": 0.3617, + "step": 2329 + }, + { + "epoch": 5.535236396074933, + "grad_norm": 1.5897773504257202, + "learning_rate": 4.4732461355529134e-06, + "loss": 0.431, + "step": 2330 + }, + { + "epoch": 5.537615224501932, + "grad_norm": 1.4138380289077759, + "learning_rate": 4.470868014268728e-06, + "loss": 0.4596, + "step": 2331 + }, + { + "epoch": 5.539994052928932, + "grad_norm": 1.0935038328170776, + "learning_rate": 4.468489892984543e-06, + "loss": 0.3513, + "step": 2332 + }, + { + "epoch": 5.5423728813559325, + "grad_norm": 1.2140233516693115, + "learning_rate": 4.466111771700357e-06, + "loss": 0.343, + "step": 2333 + }, + { + "epoch": 5.544751709782932, + "grad_norm": 1.2504364252090454, + "learning_rate": 4.463733650416172e-06, + "loss": 0.3904, + "step": 2334 + }, + { + "epoch": 5.547130538209932, + "grad_norm": 1.1587966680526733, + "learning_rate": 4.461355529131986e-06, + "loss": 0.3323, + "step": 2335 + }, + { + "epoch": 5.549509366636931, + "grad_norm": 1.093661904335022, + "learning_rate": 4.4589774078478e-06, + "loss": 0.2888, + "step": 2336 + }, + { + "epoch": 5.551888195063931, + "grad_norm": 1.272356390953064, + "learning_rate": 4.456599286563615e-06, + "loss": 0.322, + "step": 2337 + }, + { + "epoch": 5.55426702349093, + "grad_norm": 1.5073039531707764, + "learning_rate": 4.4542211652794295e-06, + "loss": 0.4065, + "step": 2338 + }, + { + "epoch": 5.5566458519179305, + "grad_norm": 1.272123098373413, + "learning_rate": 4.451843043995244e-06, + "loss": 0.375, + "step": 2339 + }, + { + "epoch": 5.55902468034493, + "grad_norm": 1.2156635522842407, + "learning_rate": 4.449464922711059e-06, + "loss": 0.309, + "step": 2340 + }, + { + "epoch": 5.56140350877193, + "grad_norm": 1.267327904701233, + "learning_rate": 4.447086801426873e-06, + "loss": 0.3422, + "step": 2341 + }, + { + "epoch": 5.56378233719893, + "grad_norm": 1.203092336654663, + "learning_rate": 4.444708680142688e-06, + "loss": 0.3747, + "step": 2342 + }, + { + "epoch": 5.566161165625929, + "grad_norm": 1.14577054977417, + "learning_rate": 4.442330558858502e-06, + "loss": 0.3616, + "step": 2343 + }, + { + "epoch": 5.568539994052929, + "grad_norm": 1.2138999700546265, + "learning_rate": 4.439952437574316e-06, + "loss": 0.3111, + "step": 2344 + }, + { + "epoch": 5.570918822479928, + "grad_norm": 1.3812578916549683, + "learning_rate": 4.437574316290132e-06, + "loss": 0.3587, + "step": 2345 + }, + { + "epoch": 5.5732976509069285, + "grad_norm": 1.3237173557281494, + "learning_rate": 4.4351961950059455e-06, + "loss": 0.4712, + "step": 2346 + }, + { + "epoch": 5.575676479333928, + "grad_norm": 1.4613207578659058, + "learning_rate": 4.43281807372176e-06, + "loss": 0.3634, + "step": 2347 + }, + { + "epoch": 5.578055307760928, + "grad_norm": 1.2635977268218994, + "learning_rate": 4.430439952437575e-06, + "loss": 0.3715, + "step": 2348 + }, + { + "epoch": 5.580434136187927, + "grad_norm": 1.306638240814209, + "learning_rate": 4.428061831153389e-06, + "loss": 0.4187, + "step": 2349 + }, + { + "epoch": 5.582812964614927, + "grad_norm": 1.2098315954208374, + "learning_rate": 4.425683709869203e-06, + "loss": 0.3714, + "step": 2350 + }, + { + "epoch": 5.582812964614927, + "eval_loss": 0.4250815808773041, + "eval_runtime": 23.0354, + "eval_samples_per_second": 32.472, + "eval_steps_per_second": 16.236, + "step": 2350 + }, + { + "epoch": 5.585191793041927, + "grad_norm": 1.1911180019378662, + "learning_rate": 4.4233055885850186e-06, + "loss": 0.335, + "step": 2351 + }, + { + "epoch": 5.5875706214689265, + "grad_norm": 1.4686857461929321, + "learning_rate": 4.420927467300833e-06, + "loss": 0.3364, + "step": 2352 + }, + { + "epoch": 5.589949449895927, + "grad_norm": 1.3566080331802368, + "learning_rate": 4.418549346016647e-06, + "loss": 0.3577, + "step": 2353 + }, + { + "epoch": 5.592328278322926, + "grad_norm": 1.3929115533828735, + "learning_rate": 4.4161712247324615e-06, + "loss": 0.3854, + "step": 2354 + }, + { + "epoch": 5.594707106749926, + "grad_norm": 1.2800372838974, + "learning_rate": 4.413793103448276e-06, + "loss": 0.2941, + "step": 2355 + }, + { + "epoch": 5.597085935176925, + "grad_norm": 1.3730703592300415, + "learning_rate": 4.411414982164091e-06, + "loss": 0.4061, + "step": 2356 + }, + { + "epoch": 5.599464763603925, + "grad_norm": 1.0275835990905762, + "learning_rate": 4.409036860879905e-06, + "loss": 0.2936, + "step": 2357 + }, + { + "epoch": 5.601843592030924, + "grad_norm": 1.2510380744934082, + "learning_rate": 4.40665873959572e-06, + "loss": 0.3875, + "step": 2358 + }, + { + "epoch": 5.6042224204579245, + "grad_norm": 1.2467325925827026, + "learning_rate": 4.404280618311535e-06, + "loss": 0.3685, + "step": 2359 + }, + { + "epoch": 5.606601248884925, + "grad_norm": 1.4930354356765747, + "learning_rate": 4.401902497027349e-06, + "loss": 0.4665, + "step": 2360 + }, + { + "epoch": 5.608980077311924, + "grad_norm": 1.3357295989990234, + "learning_rate": 4.399524375743163e-06, + "loss": 0.3966, + "step": 2361 + }, + { + "epoch": 5.611358905738924, + "grad_norm": 1.1427794694900513, + "learning_rate": 4.397146254458978e-06, + "loss": 0.3133, + "step": 2362 + }, + { + "epoch": 5.613737734165923, + "grad_norm": 1.4235645532608032, + "learning_rate": 4.394768133174792e-06, + "loss": 0.4207, + "step": 2363 + }, + { + "epoch": 5.616116562592923, + "grad_norm": 1.334149718284607, + "learning_rate": 4.392390011890607e-06, + "loss": 0.3753, + "step": 2364 + }, + { + "epoch": 5.6184953910199225, + "grad_norm": 1.101811170578003, + "learning_rate": 4.390011890606421e-06, + "loss": 0.3082, + "step": 2365 + }, + { + "epoch": 5.620874219446923, + "grad_norm": 1.5656487941741943, + "learning_rate": 4.387633769322236e-06, + "loss": 0.3393, + "step": 2366 + }, + { + "epoch": 5.623253047873922, + "grad_norm": 1.230817198753357, + "learning_rate": 4.385255648038051e-06, + "loss": 0.3253, + "step": 2367 + }, + { + "epoch": 5.625631876300922, + "grad_norm": 1.4257118701934814, + "learning_rate": 4.382877526753864e-06, + "loss": 0.3173, + "step": 2368 + }, + { + "epoch": 5.628010704727922, + "grad_norm": 1.1575325727462769, + "learning_rate": 4.380499405469679e-06, + "loss": 0.3519, + "step": 2369 + }, + { + "epoch": 5.630389533154921, + "grad_norm": 1.1910418272018433, + "learning_rate": 4.378121284185494e-06, + "loss": 0.3166, + "step": 2370 + }, + { + "epoch": 5.632768361581921, + "grad_norm": 1.3160704374313354, + "learning_rate": 4.375743162901308e-06, + "loss": 0.3501, + "step": 2371 + }, + { + "epoch": 5.6351471900089205, + "grad_norm": 1.362200379371643, + "learning_rate": 4.373365041617123e-06, + "loss": 0.3489, + "step": 2372 + }, + { + "epoch": 5.637526018435921, + "grad_norm": 1.3136824369430542, + "learning_rate": 4.3709869203329375e-06, + "loss": 0.3795, + "step": 2373 + }, + { + "epoch": 5.63990484686292, + "grad_norm": 1.2438726425170898, + "learning_rate": 4.368608799048752e-06, + "loss": 0.3137, + "step": 2374 + }, + { + "epoch": 5.64228367528992, + "grad_norm": 1.3314287662506104, + "learning_rate": 4.366230677764567e-06, + "loss": 0.3524, + "step": 2375 + }, + { + "epoch": 5.644662503716919, + "grad_norm": 1.424486517906189, + "learning_rate": 4.3638525564803804e-06, + "loss": 0.476, + "step": 2376 + }, + { + "epoch": 5.647041332143919, + "grad_norm": 1.3244267702102661, + "learning_rate": 4.361474435196195e-06, + "loss": 0.3578, + "step": 2377 + }, + { + "epoch": 5.6494201605709184, + "grad_norm": 1.3369777202606201, + "learning_rate": 4.35909631391201e-06, + "loss": 0.3217, + "step": 2378 + }, + { + "epoch": 5.6517989889979185, + "grad_norm": 1.208483099937439, + "learning_rate": 4.356718192627824e-06, + "loss": 0.3472, + "step": 2379 + }, + { + "epoch": 5.654177817424918, + "grad_norm": 1.1892006397247314, + "learning_rate": 4.354340071343639e-06, + "loss": 0.3208, + "step": 2380 + }, + { + "epoch": 5.656556645851918, + "grad_norm": 1.340082049369812, + "learning_rate": 4.3519619500594535e-06, + "loss": 0.3779, + "step": 2381 + }, + { + "epoch": 5.658935474278918, + "grad_norm": 1.2777416706085205, + "learning_rate": 4.349583828775268e-06, + "loss": 0.3545, + "step": 2382 + }, + { + "epoch": 5.661314302705917, + "grad_norm": 1.3743726015090942, + "learning_rate": 4.347205707491082e-06, + "loss": 0.3735, + "step": 2383 + }, + { + "epoch": 5.663693131132917, + "grad_norm": 1.2272124290466309, + "learning_rate": 4.3448275862068965e-06, + "loss": 0.3372, + "step": 2384 + }, + { + "epoch": 5.6660719595599165, + "grad_norm": 1.2641527652740479, + "learning_rate": 4.342449464922712e-06, + "loss": 0.3637, + "step": 2385 + }, + { + "epoch": 5.668450787986917, + "grad_norm": 1.217805027961731, + "learning_rate": 4.340071343638526e-06, + "loss": 0.3269, + "step": 2386 + }, + { + "epoch": 5.670829616413916, + "grad_norm": 1.2050737142562866, + "learning_rate": 4.33769322235434e-06, + "loss": 0.3327, + "step": 2387 + }, + { + "epoch": 5.673208444840916, + "grad_norm": 1.241929292678833, + "learning_rate": 4.335315101070155e-06, + "loss": 0.358, + "step": 2388 + }, + { + "epoch": 5.675587273267915, + "grad_norm": 1.1699687242507935, + "learning_rate": 4.3329369797859695e-06, + "loss": 0.314, + "step": 2389 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 1.3872138261795044, + "learning_rate": 4.330558858501784e-06, + "loss": 0.3469, + "step": 2390 + }, + { + "epoch": 5.680344930121915, + "grad_norm": 1.5021710395812988, + "learning_rate": 4.328180737217598e-06, + "loss": 0.2895, + "step": 2391 + }, + { + "epoch": 5.6827237585489145, + "grad_norm": 1.290337085723877, + "learning_rate": 4.325802615933413e-06, + "loss": 0.4113, + "step": 2392 + }, + { + "epoch": 5.685102586975915, + "grad_norm": 1.336408257484436, + "learning_rate": 4.323424494649228e-06, + "loss": 0.4131, + "step": 2393 + }, + { + "epoch": 5.687481415402914, + "grad_norm": 1.2749227285385132, + "learning_rate": 4.321046373365042e-06, + "loss": 0.3628, + "step": 2394 + }, + { + "epoch": 5.689860243829914, + "grad_norm": 1.6095540523529053, + "learning_rate": 4.318668252080856e-06, + "loss": 0.4793, + "step": 2395 + }, + { + "epoch": 5.692239072256913, + "grad_norm": 1.2831567525863647, + "learning_rate": 4.316290130796671e-06, + "loss": 0.3639, + "step": 2396 + }, + { + "epoch": 5.694617900683913, + "grad_norm": 1.3720487356185913, + "learning_rate": 4.313912009512486e-06, + "loss": 0.435, + "step": 2397 + }, + { + "epoch": 5.6969967291109125, + "grad_norm": 1.4367690086364746, + "learning_rate": 4.3115338882283e-06, + "loss": 0.33, + "step": 2398 + }, + { + "epoch": 5.699375557537913, + "grad_norm": 1.2887983322143555, + "learning_rate": 4.309155766944115e-06, + "loss": 0.2987, + "step": 2399 + }, + { + "epoch": 5.701754385964913, + "grad_norm": 1.349776268005371, + "learning_rate": 4.306777645659929e-06, + "loss": 0.3059, + "step": 2400 + }, + { + "epoch": 5.701754385964913, + "eval_loss": 0.42576730251312256, + "eval_runtime": 23.3621, + "eval_samples_per_second": 32.018, + "eval_steps_per_second": 16.009, + "step": 2400 + }, + { + "epoch": 5.704133214391912, + "grad_norm": 1.1741758584976196, + "learning_rate": 4.304399524375743e-06, + "loss": 0.3005, + "step": 2401 + }, + { + "epoch": 5.706512042818912, + "grad_norm": 1.4288971424102783, + "learning_rate": 4.302021403091558e-06, + "loss": 0.366, + "step": 2402 + }, + { + "epoch": 5.708890871245911, + "grad_norm": 1.223536491394043, + "learning_rate": 4.299643281807372e-06, + "loss": 0.418, + "step": 2403 + }, + { + "epoch": 5.711269699672911, + "grad_norm": 1.2036620378494263, + "learning_rate": 4.297265160523187e-06, + "loss": 0.3252, + "step": 2404 + }, + { + "epoch": 5.7136485280999105, + "grad_norm": 1.2744694948196411, + "learning_rate": 4.294887039239002e-06, + "loss": 0.3459, + "step": 2405 + }, + { + "epoch": 5.716027356526911, + "grad_norm": 1.2770841121673584, + "learning_rate": 4.292508917954816e-06, + "loss": 0.3132, + "step": 2406 + }, + { + "epoch": 5.71840618495391, + "grad_norm": 1.4111216068267822, + "learning_rate": 4.290130796670631e-06, + "loss": 0.3887, + "step": 2407 + }, + { + "epoch": 5.72078501338091, + "grad_norm": 1.5140353441238403, + "learning_rate": 4.2877526753864455e-06, + "loss": 0.407, + "step": 2408 + }, + { + "epoch": 5.72316384180791, + "grad_norm": 1.2891957759857178, + "learning_rate": 4.285374554102259e-06, + "loss": 0.3144, + "step": 2409 + }, + { + "epoch": 5.725542670234909, + "grad_norm": 1.3049012422561646, + "learning_rate": 4.282996432818074e-06, + "loss": 0.3894, + "step": 2410 + }, + { + "epoch": 5.727921498661909, + "grad_norm": 1.3873491287231445, + "learning_rate": 4.2806183115338884e-06, + "loss": 0.3407, + "step": 2411 + }, + { + "epoch": 5.730300327088909, + "grad_norm": 1.2798049449920654, + "learning_rate": 4.278240190249703e-06, + "loss": 0.3665, + "step": 2412 + }, + { + "epoch": 5.732679155515909, + "grad_norm": 1.3338556289672852, + "learning_rate": 4.275862068965518e-06, + "loss": 0.3259, + "step": 2413 + }, + { + "epoch": 5.735057983942908, + "grad_norm": 1.2759031057357788, + "learning_rate": 4.273483947681332e-06, + "loss": 0.3633, + "step": 2414 + }, + { + "epoch": 5.737436812369908, + "grad_norm": 1.341810703277588, + "learning_rate": 4.271105826397147e-06, + "loss": 0.3314, + "step": 2415 + }, + { + "epoch": 5.739815640796907, + "grad_norm": 1.2116082906723022, + "learning_rate": 4.2687277051129615e-06, + "loss": 0.3139, + "step": 2416 + }, + { + "epoch": 5.742194469223907, + "grad_norm": 1.2694543600082397, + "learning_rate": 4.266349583828775e-06, + "loss": 0.363, + "step": 2417 + }, + { + "epoch": 5.744573297650907, + "grad_norm": 1.3868086338043213, + "learning_rate": 4.26397146254459e-06, + "loss": 0.4053, + "step": 2418 + }, + { + "epoch": 5.746952126077907, + "grad_norm": 1.4398083686828613, + "learning_rate": 4.2615933412604045e-06, + "loss": 0.3422, + "step": 2419 + }, + { + "epoch": 5.749330954504907, + "grad_norm": 1.3065084218978882, + "learning_rate": 4.259215219976219e-06, + "loss": 0.3788, + "step": 2420 + }, + { + "epoch": 5.751709782931906, + "grad_norm": 1.4546451568603516, + "learning_rate": 4.256837098692034e-06, + "loss": 0.3491, + "step": 2421 + }, + { + "epoch": 5.754088611358906, + "grad_norm": 1.3563523292541504, + "learning_rate": 4.254458977407848e-06, + "loss": 0.3861, + "step": 2422 + }, + { + "epoch": 5.756467439785905, + "grad_norm": 1.161651849746704, + "learning_rate": 4.252080856123663e-06, + "loss": 0.2684, + "step": 2423 + }, + { + "epoch": 5.758846268212905, + "grad_norm": 1.226190447807312, + "learning_rate": 4.249702734839477e-06, + "loss": 0.343, + "step": 2424 + }, + { + "epoch": 5.761225096639905, + "grad_norm": 1.4148590564727783, + "learning_rate": 4.247324613555291e-06, + "loss": 0.368, + "step": 2425 + }, + { + "epoch": 5.763603925066905, + "grad_norm": 1.3697500228881836, + "learning_rate": 4.244946492271107e-06, + "loss": 0.3898, + "step": 2426 + }, + { + "epoch": 5.765982753493904, + "grad_norm": 1.273993730545044, + "learning_rate": 4.2425683709869205e-06, + "loss": 0.3559, + "step": 2427 + }, + { + "epoch": 5.768361581920904, + "grad_norm": 1.5440170764923096, + "learning_rate": 4.240190249702735e-06, + "loss": 0.3794, + "step": 2428 + }, + { + "epoch": 5.770740410347903, + "grad_norm": 1.2495598793029785, + "learning_rate": 4.23781212841855e-06, + "loss": 0.3359, + "step": 2429 + }, + { + "epoch": 5.773119238774903, + "grad_norm": 1.4670809507369995, + "learning_rate": 4.235434007134364e-06, + "loss": 0.4138, + "step": 2430 + }, + { + "epoch": 5.775498067201903, + "grad_norm": 1.6282352209091187, + "learning_rate": 4.233055885850179e-06, + "loss": 0.4163, + "step": 2431 + }, + { + "epoch": 5.777876895628903, + "grad_norm": 1.2993944883346558, + "learning_rate": 4.230677764565993e-06, + "loss": 0.3574, + "step": 2432 + }, + { + "epoch": 5.780255724055903, + "grad_norm": 1.5883249044418335, + "learning_rate": 4.228299643281808e-06, + "loss": 0.4197, + "step": 2433 + }, + { + "epoch": 5.782634552482902, + "grad_norm": 1.3945796489715576, + "learning_rate": 4.225921521997623e-06, + "loss": 0.3087, + "step": 2434 + }, + { + "epoch": 5.785013380909902, + "grad_norm": 1.2997937202453613, + "learning_rate": 4.2235434007134366e-06, + "loss": 0.391, + "step": 2435 + }, + { + "epoch": 5.787392209336901, + "grad_norm": 1.2544746398925781, + "learning_rate": 4.221165279429251e-06, + "loss": 0.3096, + "step": 2436 + }, + { + "epoch": 5.789771037763901, + "grad_norm": 1.1415493488311768, + "learning_rate": 4.218787158145066e-06, + "loss": 0.3369, + "step": 2437 + }, + { + "epoch": 5.7921498661909006, + "grad_norm": 1.385107159614563, + "learning_rate": 4.21640903686088e-06, + "loss": 0.3355, + "step": 2438 + }, + { + "epoch": 5.794528694617901, + "grad_norm": 1.3469212055206299, + "learning_rate": 4.214030915576694e-06, + "loss": 0.3616, + "step": 2439 + }, + { + "epoch": 5.796907523044901, + "grad_norm": 1.295502781867981, + "learning_rate": 4.21165279429251e-06, + "loss": 0.3785, + "step": 2440 + }, + { + "epoch": 5.7992863514719, + "grad_norm": 1.2500089406967163, + "learning_rate": 4.209274673008324e-06, + "loss": 0.3196, + "step": 2441 + }, + { + "epoch": 5.8016651798989, + "grad_norm": 1.3587085008621216, + "learning_rate": 4.206896551724138e-06, + "loss": 0.364, + "step": 2442 + }, + { + "epoch": 5.804044008325899, + "grad_norm": 1.3405495882034302, + "learning_rate": 4.204518430439953e-06, + "loss": 0.3083, + "step": 2443 + }, + { + "epoch": 5.806422836752899, + "grad_norm": 1.4187051057815552, + "learning_rate": 4.202140309155767e-06, + "loss": 0.4168, + "step": 2444 + }, + { + "epoch": 5.808801665179899, + "grad_norm": 1.4374973773956299, + "learning_rate": 4.199762187871582e-06, + "loss": 0.3902, + "step": 2445 + }, + { + "epoch": 5.811180493606899, + "grad_norm": 1.1398026943206787, + "learning_rate": 4.1973840665873964e-06, + "loss": 0.3392, + "step": 2446 + }, + { + "epoch": 5.813559322033898, + "grad_norm": 1.157288908958435, + "learning_rate": 4.195005945303211e-06, + "loss": 0.2963, + "step": 2447 + }, + { + "epoch": 5.815938150460898, + "grad_norm": 1.32456374168396, + "learning_rate": 4.192627824019026e-06, + "loss": 0.4377, + "step": 2448 + }, + { + "epoch": 5.818316978887898, + "grad_norm": 1.274467945098877, + "learning_rate": 4.19024970273484e-06, + "loss": 0.4056, + "step": 2449 + }, + { + "epoch": 5.820695807314897, + "grad_norm": 1.2734521627426147, + "learning_rate": 4.187871581450654e-06, + "loss": 0.3032, + "step": 2450 + }, + { + "epoch": 5.820695807314897, + "eval_loss": 0.42416471242904663, + "eval_runtime": 23.1412, + "eval_samples_per_second": 32.323, + "eval_steps_per_second": 16.162, + "step": 2450 + }, + { + "epoch": 5.823074635741897, + "grad_norm": 1.3587642908096313, + "learning_rate": 4.185493460166469e-06, + "loss": 0.3268, + "step": 2451 + }, + { + "epoch": 5.825453464168897, + "grad_norm": 1.193291187286377, + "learning_rate": 4.183115338882283e-06, + "loss": 0.32, + "step": 2452 + }, + { + "epoch": 5.827832292595897, + "grad_norm": 1.4782814979553223, + "learning_rate": 4.180737217598098e-06, + "loss": 0.3222, + "step": 2453 + }, + { + "epoch": 5.830211121022896, + "grad_norm": 1.1818342208862305, + "learning_rate": 4.1783590963139125e-06, + "loss": 0.315, + "step": 2454 + }, + { + "epoch": 5.832589949449896, + "grad_norm": 1.3878283500671387, + "learning_rate": 4.175980975029727e-06, + "loss": 0.3864, + "step": 2455 + }, + { + "epoch": 5.834968777876895, + "grad_norm": 1.251720905303955, + "learning_rate": 4.173602853745542e-06, + "loss": 0.4153, + "step": 2456 + }, + { + "epoch": 5.837347606303895, + "grad_norm": 1.4846504926681519, + "learning_rate": 4.1712247324613555e-06, + "loss": 0.3087, + "step": 2457 + }, + { + "epoch": 5.8397264347308955, + "grad_norm": 1.2438558340072632, + "learning_rate": 4.16884661117717e-06, + "loss": 0.3317, + "step": 2458 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 1.4153788089752197, + "learning_rate": 4.166468489892985e-06, + "loss": 0.4395, + "step": 2459 + }, + { + "epoch": 5.844484091584895, + "grad_norm": 1.2602993249893188, + "learning_rate": 4.164090368608799e-06, + "loss": 0.3906, + "step": 2460 + }, + { + "epoch": 5.846862920011894, + "grad_norm": 1.2313951253890991, + "learning_rate": 4.161712247324614e-06, + "loss": 0.3555, + "step": 2461 + }, + { + "epoch": 5.849241748438894, + "grad_norm": 1.2999407052993774, + "learning_rate": 4.1593341260404285e-06, + "loss": 0.3598, + "step": 2462 + }, + { + "epoch": 5.851620576865893, + "grad_norm": 1.2809889316558838, + "learning_rate": 4.156956004756243e-06, + "loss": 0.3415, + "step": 2463 + }, + { + "epoch": 5.853999405292893, + "grad_norm": 1.3394689559936523, + "learning_rate": 4.154577883472058e-06, + "loss": 0.3874, + "step": 2464 + }, + { + "epoch": 5.856378233719893, + "grad_norm": 1.3208613395690918, + "learning_rate": 4.1521997621878715e-06, + "loss": 0.322, + "step": 2465 + }, + { + "epoch": 5.858757062146893, + "grad_norm": 1.2372848987579346, + "learning_rate": 4.149821640903686e-06, + "loss": 0.2968, + "step": 2466 + }, + { + "epoch": 5.861135890573893, + "grad_norm": 1.315977692604065, + "learning_rate": 4.1474435196195016e-06, + "loss": 0.3143, + "step": 2467 + }, + { + "epoch": 5.863514719000892, + "grad_norm": 1.2617524862289429, + "learning_rate": 4.145065398335315e-06, + "loss": 0.3503, + "step": 2468 + }, + { + "epoch": 5.865893547427892, + "grad_norm": 1.297440528869629, + "learning_rate": 4.14268727705113e-06, + "loss": 0.3145, + "step": 2469 + }, + { + "epoch": 5.868272375854891, + "grad_norm": 1.3299028873443604, + "learning_rate": 4.1403091557669446e-06, + "loss": 0.4128, + "step": 2470 + }, + { + "epoch": 5.8706512042818915, + "grad_norm": 1.227639079093933, + "learning_rate": 4.137931034482759e-06, + "loss": 0.3194, + "step": 2471 + }, + { + "epoch": 5.873030032708891, + "grad_norm": 1.3324947357177734, + "learning_rate": 4.135552913198573e-06, + "loss": 0.366, + "step": 2472 + }, + { + "epoch": 5.875408861135891, + "grad_norm": 1.3640031814575195, + "learning_rate": 4.1331747919143875e-06, + "loss": 0.3805, + "step": 2473 + }, + { + "epoch": 5.87778768956289, + "grad_norm": 1.2263596057891846, + "learning_rate": 4.130796670630203e-06, + "loss": 0.3426, + "step": 2474 + }, + { + "epoch": 5.88016651798989, + "grad_norm": 1.214850902557373, + "learning_rate": 4.128418549346017e-06, + "loss": 0.287, + "step": 2475 + }, + { + "epoch": 5.882545346416889, + "grad_norm": 1.247874140739441, + "learning_rate": 4.126040428061831e-06, + "loss": 0.3614, + "step": 2476 + }, + { + "epoch": 5.884924174843889, + "grad_norm": 1.407050371170044, + "learning_rate": 4.123662306777646e-06, + "loss": 0.3283, + "step": 2477 + }, + { + "epoch": 5.887303003270889, + "grad_norm": 1.4140002727508545, + "learning_rate": 4.121284185493461e-06, + "loss": 0.3561, + "step": 2478 + }, + { + "epoch": 5.889681831697889, + "grad_norm": 1.2350397109985352, + "learning_rate": 4.118906064209275e-06, + "loss": 0.3471, + "step": 2479 + }, + { + "epoch": 5.892060660124889, + "grad_norm": 1.4300284385681152, + "learning_rate": 4.116527942925089e-06, + "loss": 0.3493, + "step": 2480 + }, + { + "epoch": 5.894439488551888, + "grad_norm": 1.4063663482666016, + "learning_rate": 4.1141498216409044e-06, + "loss": 0.4132, + "step": 2481 + }, + { + "epoch": 5.896818316978888, + "grad_norm": 1.3661792278289795, + "learning_rate": 4.111771700356719e-06, + "loss": 0.304, + "step": 2482 + }, + { + "epoch": 5.899197145405887, + "grad_norm": 1.1985987424850464, + "learning_rate": 4.109393579072533e-06, + "loss": 0.3254, + "step": 2483 + }, + { + "epoch": 5.9015759738328875, + "grad_norm": 1.4380401372909546, + "learning_rate": 4.107015457788347e-06, + "loss": 0.3286, + "step": 2484 + }, + { + "epoch": 5.903954802259887, + "grad_norm": 1.3231626749038696, + "learning_rate": 4.104637336504162e-06, + "loss": 0.3069, + "step": 2485 + }, + { + "epoch": 5.906333630686887, + "grad_norm": 1.235176682472229, + "learning_rate": 4.102259215219977e-06, + "loss": 0.3309, + "step": 2486 + }, + { + "epoch": 5.908712459113886, + "grad_norm": 1.6012660264968872, + "learning_rate": 4.09988109393579e-06, + "loss": 0.4486, + "step": 2487 + }, + { + "epoch": 5.911091287540886, + "grad_norm": 1.4298604726791382, + "learning_rate": 4.097502972651606e-06, + "loss": 0.3915, + "step": 2488 + }, + { + "epoch": 5.913470115967886, + "grad_norm": 1.305324673652649, + "learning_rate": 4.0951248513674205e-06, + "loss": 0.3546, + "step": 2489 + }, + { + "epoch": 5.915848944394885, + "grad_norm": 1.1478954553604126, + "learning_rate": 4.092746730083234e-06, + "loss": 0.2889, + "step": 2490 + }, + { + "epoch": 5.9182277728218855, + "grad_norm": 1.3921252489089966, + "learning_rate": 4.090368608799049e-06, + "loss": 0.3828, + "step": 2491 + }, + { + "epoch": 5.920606601248885, + "grad_norm": 1.2712653875350952, + "learning_rate": 4.0879904875148635e-06, + "loss": 0.3378, + "step": 2492 + }, + { + "epoch": 5.922985429675885, + "grad_norm": 1.5039105415344238, + "learning_rate": 4.085612366230678e-06, + "loss": 0.3589, + "step": 2493 + }, + { + "epoch": 5.925364258102884, + "grad_norm": 1.4243011474609375, + "learning_rate": 4.083234244946493e-06, + "loss": 0.4224, + "step": 2494 + }, + { + "epoch": 5.927743086529884, + "grad_norm": 1.326659083366394, + "learning_rate": 4.080856123662307e-06, + "loss": 0.393, + "step": 2495 + }, + { + "epoch": 5.930121914956883, + "grad_norm": 1.313839077949524, + "learning_rate": 4.078478002378122e-06, + "loss": 0.2838, + "step": 2496 + }, + { + "epoch": 5.9325007433838834, + "grad_norm": 1.2728077173233032, + "learning_rate": 4.0760998810939365e-06, + "loss": 0.3546, + "step": 2497 + }, + { + "epoch": 5.9348795718108835, + "grad_norm": 1.5198734998703003, + "learning_rate": 4.07372175980975e-06, + "loss": 0.4474, + "step": 2498 + }, + { + "epoch": 5.937258400237883, + "grad_norm": 1.4003756046295166, + "learning_rate": 4.071343638525565e-06, + "loss": 0.3456, + "step": 2499 + }, + { + "epoch": 5.939637228664883, + "grad_norm": 1.1408597230911255, + "learning_rate": 4.0689655172413795e-06, + "loss": 0.3674, + "step": 2500 + }, + { + "epoch": 5.939637228664883, + "eval_loss": 0.4224425256252289, + "eval_runtime": 23.0539, + "eval_samples_per_second": 32.446, + "eval_steps_per_second": 16.223, + "step": 2500 + }, + { + "epoch": 5.942016057091882, + "grad_norm": 1.2138983011245728, + "learning_rate": 4.066587395957194e-06, + "loss": 0.3375, + "step": 2501 + }, + { + "epoch": 5.944394885518882, + "grad_norm": 1.35971200466156, + "learning_rate": 4.064209274673009e-06, + "loss": 0.3519, + "step": 2502 + }, + { + "epoch": 5.946773713945881, + "grad_norm": 1.268079400062561, + "learning_rate": 4.061831153388823e-06, + "loss": 0.3048, + "step": 2503 + }, + { + "epoch": 5.9491525423728815, + "grad_norm": 1.1439648866653442, + "learning_rate": 4.059453032104638e-06, + "loss": 0.351, + "step": 2504 + }, + { + "epoch": 5.951531370799881, + "grad_norm": 1.3303587436676025, + "learning_rate": 4.0570749108204525e-06, + "loss": 0.3799, + "step": 2505 + }, + { + "epoch": 5.953910199226881, + "grad_norm": 1.3696061372756958, + "learning_rate": 4.054696789536266e-06, + "loss": 0.4025, + "step": 2506 + }, + { + "epoch": 5.956289027653881, + "grad_norm": 1.2975456714630127, + "learning_rate": 4.052318668252081e-06, + "loss": 0.3972, + "step": 2507 + }, + { + "epoch": 5.95866785608088, + "grad_norm": 1.4050956964492798, + "learning_rate": 4.049940546967896e-06, + "loss": 0.3483, + "step": 2508 + }, + { + "epoch": 5.96104668450788, + "grad_norm": 1.3199036121368408, + "learning_rate": 4.04756242568371e-06, + "loss": 0.3436, + "step": 2509 + }, + { + "epoch": 5.963425512934879, + "grad_norm": 1.2577695846557617, + "learning_rate": 4.045184304399525e-06, + "loss": 0.3858, + "step": 2510 + }, + { + "epoch": 5.9658043413618795, + "grad_norm": 1.2527928352355957, + "learning_rate": 4.042806183115339e-06, + "loss": 0.4359, + "step": 2511 + }, + { + "epoch": 5.968183169788879, + "grad_norm": 1.3702149391174316, + "learning_rate": 4.040428061831154e-06, + "loss": 0.3662, + "step": 2512 + }, + { + "epoch": 5.970561998215879, + "grad_norm": 1.311537504196167, + "learning_rate": 4.038049940546968e-06, + "loss": 0.3304, + "step": 2513 + }, + { + "epoch": 5.972940826642878, + "grad_norm": 1.3257426023483276, + "learning_rate": 4.035671819262782e-06, + "loss": 0.3723, + "step": 2514 + }, + { + "epoch": 5.975319655069878, + "grad_norm": 1.386452317237854, + "learning_rate": 4.033293697978598e-06, + "loss": 0.322, + "step": 2515 + }, + { + "epoch": 5.977698483496878, + "grad_norm": 1.3452880382537842, + "learning_rate": 4.0309155766944116e-06, + "loss": 0.3341, + "step": 2516 + }, + { + "epoch": 5.9800773119238775, + "grad_norm": 1.3027344942092896, + "learning_rate": 4.028537455410226e-06, + "loss": 0.3183, + "step": 2517 + }, + { + "epoch": 5.982456140350877, + "grad_norm": 1.2878599166870117, + "learning_rate": 4.026159334126041e-06, + "loss": 0.3452, + "step": 2518 + }, + { + "epoch": 5.984834968777877, + "grad_norm": 1.4081708192825317, + "learning_rate": 4.023781212841855e-06, + "loss": 0.3787, + "step": 2519 + }, + { + "epoch": 5.987213797204877, + "grad_norm": 1.4395925998687744, + "learning_rate": 4.02140309155767e-06, + "loss": 0.3288, + "step": 2520 + }, + { + "epoch": 5.989592625631876, + "grad_norm": 1.3262885808944702, + "learning_rate": 4.019024970273484e-06, + "loss": 0.357, + "step": 2521 + }, + { + "epoch": 5.991971454058876, + "grad_norm": 1.5246473550796509, + "learning_rate": 4.016646848989299e-06, + "loss": 0.318, + "step": 2522 + }, + { + "epoch": 5.994350282485875, + "grad_norm": 1.4919283390045166, + "learning_rate": 4.014268727705114e-06, + "loss": 0.3573, + "step": 2523 + }, + { + "epoch": 5.9967291109128755, + "grad_norm": 1.4421066045761108, + "learning_rate": 4.011890606420928e-06, + "loss": 0.3735, + "step": 2524 + }, + { + "epoch": 5.999107939339875, + "grad_norm": 1.316329836845398, + "learning_rate": 4.009512485136742e-06, + "loss": 0.3399, + "step": 2525 + }, + { + "epoch": 6.0, + "grad_norm": 2.254108190536499, + "learning_rate": 4.007134363852557e-06, + "loss": 0.497, + "step": 2526 + }, + { + "epoch": 6.002378828427, + "grad_norm": 1.212050437927246, + "learning_rate": 4.0047562425683714e-06, + "loss": 0.3309, + "step": 2527 + }, + { + "epoch": 6.004757656853999, + "grad_norm": 1.3998914957046509, + "learning_rate": 4.002378121284185e-06, + "loss": 0.3646, + "step": 2528 + }, + { + "epoch": 6.007136485280999, + "grad_norm": 1.3298025131225586, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3493, + "step": 2529 + }, + { + "epoch": 6.009515313707999, + "grad_norm": 1.5306940078735352, + "learning_rate": 3.997621878715815e-06, + "loss": 0.3512, + "step": 2530 + }, + { + "epoch": 6.011894142134999, + "grad_norm": 1.2071857452392578, + "learning_rate": 3.995243757431629e-06, + "loss": 0.3718, + "step": 2531 + }, + { + "epoch": 6.014272970561998, + "grad_norm": 1.308465838432312, + "learning_rate": 3.992865636147444e-06, + "loss": 0.4081, + "step": 2532 + }, + { + "epoch": 6.016651798988998, + "grad_norm": 1.4098211526870728, + "learning_rate": 3.990487514863258e-06, + "loss": 0.3549, + "step": 2533 + }, + { + "epoch": 6.019030627415997, + "grad_norm": 1.3329410552978516, + "learning_rate": 3.988109393579073e-06, + "loss": 0.3299, + "step": 2534 + }, + { + "epoch": 6.021409455842997, + "grad_norm": 1.2828419208526611, + "learning_rate": 3.9857312722948875e-06, + "loss": 0.3701, + "step": 2535 + }, + { + "epoch": 6.0237882842699975, + "grad_norm": 1.220705270767212, + "learning_rate": 3.983353151010702e-06, + "loss": 0.2835, + "step": 2536 + }, + { + "epoch": 6.026167112696997, + "grad_norm": 1.2435150146484375, + "learning_rate": 3.980975029726517e-06, + "loss": 0.3386, + "step": 2537 + }, + { + "epoch": 6.028545941123997, + "grad_norm": 1.1658509969711304, + "learning_rate": 3.978596908442331e-06, + "loss": 0.3134, + "step": 2538 + }, + { + "epoch": 6.030924769550996, + "grad_norm": 1.1792610883712769, + "learning_rate": 3.976218787158145e-06, + "loss": 0.3513, + "step": 2539 + }, + { + "epoch": 6.033303597977996, + "grad_norm": 1.215883493423462, + "learning_rate": 3.97384066587396e-06, + "loss": 0.3231, + "step": 2540 + }, + { + "epoch": 6.035682426404995, + "grad_norm": 1.3747514486312866, + "learning_rate": 3.971462544589774e-06, + "loss": 0.3999, + "step": 2541 + }, + { + "epoch": 6.038061254831995, + "grad_norm": 1.22146737575531, + "learning_rate": 3.969084423305589e-06, + "loss": 0.3099, + "step": 2542 + }, + { + "epoch": 6.040440083258995, + "grad_norm": 1.3517014980316162, + "learning_rate": 3.9667063020214035e-06, + "loss": 0.3279, + "step": 2543 + }, + { + "epoch": 6.042818911685995, + "grad_norm": 1.529627799987793, + "learning_rate": 3.964328180737218e-06, + "loss": 0.3727, + "step": 2544 + }, + { + "epoch": 6.045197740112994, + "grad_norm": 1.1970479488372803, + "learning_rate": 3.961950059453033e-06, + "loss": 0.2882, + "step": 2545 + }, + { + "epoch": 6.047576568539994, + "grad_norm": 1.472764253616333, + "learning_rate": 3.9595719381688465e-06, + "loss": 0.4079, + "step": 2546 + }, + { + "epoch": 6.049955396966994, + "grad_norm": 1.3406745195388794, + "learning_rate": 3.957193816884661e-06, + "loss": 0.331, + "step": 2547 + }, + { + "epoch": 6.052334225393993, + "grad_norm": 1.3253549337387085, + "learning_rate": 3.954815695600476e-06, + "loss": 0.3248, + "step": 2548 + }, + { + "epoch": 6.0547130538209935, + "grad_norm": 1.300209403038025, + "learning_rate": 3.95243757431629e-06, + "loss": 0.3748, + "step": 2549 + }, + { + "epoch": 6.057091882247993, + "grad_norm": 1.123270034790039, + "learning_rate": 3.950059453032105e-06, + "loss": 0.2615, + "step": 2550 + }, + { + "epoch": 6.057091882247993, + "eval_loss": 0.4250085949897766, + "eval_runtime": 23.6747, + "eval_samples_per_second": 31.595, + "eval_steps_per_second": 15.797, + "step": 2550 + }, + { + "epoch": 6.059470710674993, + "grad_norm": 1.2300753593444824, + "learning_rate": 3.9476813317479196e-06, + "loss": 0.3242, + "step": 2551 + }, + { + "epoch": 6.061849539101992, + "grad_norm": 1.5784087181091309, + "learning_rate": 3.945303210463734e-06, + "loss": 0.314, + "step": 2552 + }, + { + "epoch": 6.064228367528992, + "grad_norm": 1.2567307949066162, + "learning_rate": 3.942925089179549e-06, + "loss": 0.2707, + "step": 2553 + }, + { + "epoch": 6.066607195955991, + "grad_norm": 1.3432345390319824, + "learning_rate": 3.9405469678953626e-06, + "loss": 0.3358, + "step": 2554 + }, + { + "epoch": 6.068986024382991, + "grad_norm": 1.259826898574829, + "learning_rate": 3.938168846611177e-06, + "loss": 0.2677, + "step": 2555 + }, + { + "epoch": 6.0713648528099915, + "grad_norm": 1.553199291229248, + "learning_rate": 3.935790725326993e-06, + "loss": 0.4146, + "step": 2556 + }, + { + "epoch": 6.073743681236991, + "grad_norm": 1.6028739213943481, + "learning_rate": 3.933412604042806e-06, + "loss": 0.3783, + "step": 2557 + }, + { + "epoch": 6.076122509663991, + "grad_norm": 1.1377569437026978, + "learning_rate": 3.931034482758621e-06, + "loss": 0.2892, + "step": 2558 + }, + { + "epoch": 6.07850133809099, + "grad_norm": 1.459771990776062, + "learning_rate": 3.928656361474436e-06, + "loss": 0.402, + "step": 2559 + }, + { + "epoch": 6.08088016651799, + "grad_norm": 1.3900012969970703, + "learning_rate": 3.92627824019025e-06, + "loss": 0.3713, + "step": 2560 + }, + { + "epoch": 6.083258994944989, + "grad_norm": 1.275058627128601, + "learning_rate": 3.923900118906064e-06, + "loss": 0.2822, + "step": 2561 + }, + { + "epoch": 6.085637823371989, + "grad_norm": 1.4702314138412476, + "learning_rate": 3.921521997621879e-06, + "loss": 0.3744, + "step": 2562 + }, + { + "epoch": 6.088016651798989, + "grad_norm": 1.2283591032028198, + "learning_rate": 3.919143876337694e-06, + "loss": 0.3331, + "step": 2563 + }, + { + "epoch": 6.090395480225989, + "grad_norm": 1.2543374300003052, + "learning_rate": 3.916765755053508e-06, + "loss": 0.2978, + "step": 2564 + }, + { + "epoch": 6.092774308652988, + "grad_norm": 1.4499620199203491, + "learning_rate": 3.914387633769322e-06, + "loss": 0.2813, + "step": 2565 + }, + { + "epoch": 6.095153137079988, + "grad_norm": 1.3508583307266235, + "learning_rate": 3.912009512485137e-06, + "loss": 0.4071, + "step": 2566 + }, + { + "epoch": 6.097531965506988, + "grad_norm": 1.4521112442016602, + "learning_rate": 3.909631391200952e-06, + "loss": 0.3496, + "step": 2567 + }, + { + "epoch": 6.099910793933987, + "grad_norm": 1.3008768558502197, + "learning_rate": 3.907253269916766e-06, + "loss": 0.3175, + "step": 2568 + }, + { + "epoch": 6.1022896223609875, + "grad_norm": 1.4323439598083496, + "learning_rate": 3.90487514863258e-06, + "loss": 0.3982, + "step": 2569 + }, + { + "epoch": 6.104668450787987, + "grad_norm": 1.166544795036316, + "learning_rate": 3.9024970273483955e-06, + "loss": 0.3144, + "step": 2570 + }, + { + "epoch": 6.107047279214987, + "grad_norm": 1.3104971647262573, + "learning_rate": 3.90011890606421e-06, + "loss": 0.3622, + "step": 2571 + }, + { + "epoch": 6.109426107641986, + "grad_norm": 1.5338175296783447, + "learning_rate": 3.897740784780024e-06, + "loss": 0.371, + "step": 2572 + }, + { + "epoch": 6.111804936068986, + "grad_norm": 1.3089890480041504, + "learning_rate": 3.8953626634958385e-06, + "loss": 0.3373, + "step": 2573 + }, + { + "epoch": 6.114183764495985, + "grad_norm": 1.1850541830062866, + "learning_rate": 3.892984542211653e-06, + "loss": 0.2219, + "step": 2574 + }, + { + "epoch": 6.116562592922985, + "grad_norm": 1.349560260772705, + "learning_rate": 3.890606420927468e-06, + "loss": 0.3315, + "step": 2575 + }, + { + "epoch": 6.1189414213499855, + "grad_norm": 1.3233453035354614, + "learning_rate": 3.8882282996432814e-06, + "loss": 0.3875, + "step": 2576 + }, + { + "epoch": 6.121320249776985, + "grad_norm": 1.5820964574813843, + "learning_rate": 3.885850178359097e-06, + "loss": 0.3972, + "step": 2577 + }, + { + "epoch": 6.123699078203985, + "grad_norm": 1.3878401517868042, + "learning_rate": 3.8834720570749115e-06, + "loss": 0.381, + "step": 2578 + }, + { + "epoch": 6.126077906630984, + "grad_norm": 1.4216152429580688, + "learning_rate": 3.881093935790725e-06, + "loss": 0.3462, + "step": 2579 + }, + { + "epoch": 6.128456735057984, + "grad_norm": 1.3979926109313965, + "learning_rate": 3.87871581450654e-06, + "loss": 0.3849, + "step": 2580 + }, + { + "epoch": 6.130835563484983, + "grad_norm": 1.2560980319976807, + "learning_rate": 3.8763376932223545e-06, + "loss": 0.3653, + "step": 2581 + }, + { + "epoch": 6.1332143919119835, + "grad_norm": 1.2554693222045898, + "learning_rate": 3.873959571938169e-06, + "loss": 0.362, + "step": 2582 + }, + { + "epoch": 6.135593220338983, + "grad_norm": 1.087634801864624, + "learning_rate": 3.871581450653984e-06, + "loss": 0.2916, + "step": 2583 + }, + { + "epoch": 6.137972048765983, + "grad_norm": 1.3429555892944336, + "learning_rate": 3.869203329369798e-06, + "loss": 0.4075, + "step": 2584 + }, + { + "epoch": 6.140350877192983, + "grad_norm": 1.396567940711975, + "learning_rate": 3.866825208085613e-06, + "loss": 0.3967, + "step": 2585 + }, + { + "epoch": 6.142729705619982, + "grad_norm": 1.3268336057662964, + "learning_rate": 3.8644470868014276e-06, + "loss": 0.3108, + "step": 2586 + }, + { + "epoch": 6.145108534046982, + "grad_norm": 1.2219895124435425, + "learning_rate": 3.862068965517241e-06, + "loss": 0.2972, + "step": 2587 + }, + { + "epoch": 6.147487362473981, + "grad_norm": 1.2738436460494995, + "learning_rate": 3.859690844233056e-06, + "loss": 0.3827, + "step": 2588 + }, + { + "epoch": 6.1498661909009815, + "grad_norm": 1.2809815406799316, + "learning_rate": 3.8573127229488705e-06, + "loss": 0.29, + "step": 2589 + }, + { + "epoch": 6.152245019327981, + "grad_norm": 1.4483623504638672, + "learning_rate": 3.854934601664685e-06, + "loss": 0.3591, + "step": 2590 + }, + { + "epoch": 6.154623847754981, + "grad_norm": 1.3743950128555298, + "learning_rate": 3.8525564803805e-06, + "loss": 0.3304, + "step": 2591 + }, + { + "epoch": 6.15700267618198, + "grad_norm": 1.2952258586883545, + "learning_rate": 3.850178359096314e-06, + "loss": 0.331, + "step": 2592 + }, + { + "epoch": 6.15938150460898, + "grad_norm": 1.3521857261657715, + "learning_rate": 3.847800237812129e-06, + "loss": 0.3592, + "step": 2593 + }, + { + "epoch": 6.161760333035979, + "grad_norm": 1.349928617477417, + "learning_rate": 3.845422116527943e-06, + "loss": 0.3681, + "step": 2594 + }, + { + "epoch": 6.1641391614629795, + "grad_norm": 1.2646950483322144, + "learning_rate": 3.843043995243757e-06, + "loss": 0.283, + "step": 2595 + }, + { + "epoch": 6.16651798988998, + "grad_norm": 1.3562172651290894, + "learning_rate": 3.840665873959572e-06, + "loss": 0.2906, + "step": 2596 + }, + { + "epoch": 6.168896818316979, + "grad_norm": 1.2698155641555786, + "learning_rate": 3.838287752675387e-06, + "loss": 0.2687, + "step": 2597 + }, + { + "epoch": 6.171275646743979, + "grad_norm": 1.2567254304885864, + "learning_rate": 3.835909631391201e-06, + "loss": 0.3129, + "step": 2598 + }, + { + "epoch": 6.173654475170978, + "grad_norm": 1.3541849851608276, + "learning_rate": 3.833531510107016e-06, + "loss": 0.3748, + "step": 2599 + }, + { + "epoch": 6.176033303597978, + "grad_norm": 1.405503511428833, + "learning_rate": 3.83115338882283e-06, + "loss": 0.3913, + "step": 2600 + }, + { + "epoch": 6.176033303597978, + "eval_loss": 0.4246251583099365, + "eval_runtime": 22.9538, + "eval_samples_per_second": 32.587, + "eval_steps_per_second": 16.294, + "step": 2600 + }, + { + "epoch": 6.178412132024977, + "grad_norm": 1.399099349975586, + "learning_rate": 3.828775267538645e-06, + "loss": 0.4002, + "step": 2601 + }, + { + "epoch": 6.1807909604519775, + "grad_norm": 1.262243390083313, + "learning_rate": 3.826397146254459e-06, + "loss": 0.2581, + "step": 2602 + }, + { + "epoch": 6.183169788878977, + "grad_norm": 1.206834316253662, + "learning_rate": 3.824019024970273e-06, + "loss": 0.3479, + "step": 2603 + }, + { + "epoch": 6.185548617305977, + "grad_norm": 1.337079405784607, + "learning_rate": 3.821640903686089e-06, + "loss": 0.3703, + "step": 2604 + }, + { + "epoch": 6.187927445732977, + "grad_norm": 1.2376940250396729, + "learning_rate": 3.819262782401903e-06, + "loss": 0.2936, + "step": 2605 + }, + { + "epoch": 6.190306274159976, + "grad_norm": 1.377694010734558, + "learning_rate": 3.816884661117717e-06, + "loss": 0.3371, + "step": 2606 + }, + { + "epoch": 6.192685102586976, + "grad_norm": 1.3850666284561157, + "learning_rate": 3.814506539833532e-06, + "loss": 0.442, + "step": 2607 + }, + { + "epoch": 6.1950639310139755, + "grad_norm": 1.4156010150909424, + "learning_rate": 3.812128418549346e-06, + "loss": 0.3491, + "step": 2608 + }, + { + "epoch": 6.197442759440976, + "grad_norm": 1.4659996032714844, + "learning_rate": 3.8097502972651606e-06, + "loss": 0.3751, + "step": 2609 + }, + { + "epoch": 6.199821587867975, + "grad_norm": 1.3102697134017944, + "learning_rate": 3.8073721759809753e-06, + "loss": 0.3031, + "step": 2610 + }, + { + "epoch": 6.202200416294975, + "grad_norm": 1.2564831972122192, + "learning_rate": 3.80499405469679e-06, + "loss": 0.3229, + "step": 2611 + }, + { + "epoch": 6.204579244721974, + "grad_norm": 1.4121432304382324, + "learning_rate": 3.8026159334126045e-06, + "loss": 0.3547, + "step": 2612 + }, + { + "epoch": 6.206958073148974, + "grad_norm": 1.3601089715957642, + "learning_rate": 3.800237812128419e-06, + "loss": 0.3495, + "step": 2613 + }, + { + "epoch": 6.209336901575973, + "grad_norm": 1.2676310539245605, + "learning_rate": 3.7978596908442333e-06, + "loss": 0.2814, + "step": 2614 + }, + { + "epoch": 6.2117157300029735, + "grad_norm": 1.5988956689834595, + "learning_rate": 3.795481569560048e-06, + "loss": 0.3841, + "step": 2615 + }, + { + "epoch": 6.214094558429974, + "grad_norm": 1.3224798440933228, + "learning_rate": 3.793103448275862e-06, + "loss": 0.3374, + "step": 2616 + }, + { + "epoch": 6.216473386856973, + "grad_norm": 1.3553344011306763, + "learning_rate": 3.7907253269916767e-06, + "loss": 0.3749, + "step": 2617 + }, + { + "epoch": 6.218852215283973, + "grad_norm": 1.381098747253418, + "learning_rate": 3.7883472057074917e-06, + "loss": 0.3868, + "step": 2618 + }, + { + "epoch": 6.221231043710972, + "grad_norm": 1.4670438766479492, + "learning_rate": 3.785969084423306e-06, + "loss": 0.3283, + "step": 2619 + }, + { + "epoch": 6.223609872137972, + "grad_norm": 1.5048755407333374, + "learning_rate": 3.7835909631391205e-06, + "loss": 0.4414, + "step": 2620 + }, + { + "epoch": 6.2259887005649714, + "grad_norm": 1.3907883167266846, + "learning_rate": 3.7812128418549347e-06, + "loss": 0.3416, + "step": 2621 + }, + { + "epoch": 6.2283675289919715, + "grad_norm": 1.4164788722991943, + "learning_rate": 3.7788347205707493e-06, + "loss": 0.3431, + "step": 2622 + }, + { + "epoch": 6.230746357418971, + "grad_norm": 1.3779408931732178, + "learning_rate": 3.776456599286564e-06, + "loss": 0.3038, + "step": 2623 + }, + { + "epoch": 6.233125185845971, + "grad_norm": 1.319889783859253, + "learning_rate": 3.7740784780023785e-06, + "loss": 0.2998, + "step": 2624 + }, + { + "epoch": 6.235504014272971, + "grad_norm": 1.335462212562561, + "learning_rate": 3.771700356718193e-06, + "loss": 0.3827, + "step": 2625 + }, + { + "epoch": 6.23788284269997, + "grad_norm": 1.2803829908370972, + "learning_rate": 3.7693222354340078e-06, + "loss": 0.3158, + "step": 2626 + }, + { + "epoch": 6.24026167112697, + "grad_norm": 1.4062762260437012, + "learning_rate": 3.766944114149822e-06, + "loss": 0.3213, + "step": 2627 + }, + { + "epoch": 6.2426404995539695, + "grad_norm": 1.5046403408050537, + "learning_rate": 3.7645659928656366e-06, + "loss": 0.3908, + "step": 2628 + }, + { + "epoch": 6.24501932798097, + "grad_norm": 1.492322325706482, + "learning_rate": 3.7621878715814507e-06, + "loss": 0.411, + "step": 2629 + }, + { + "epoch": 6.247398156407969, + "grad_norm": 1.3617651462554932, + "learning_rate": 3.7598097502972654e-06, + "loss": 0.2895, + "step": 2630 + }, + { + "epoch": 6.249776984834969, + "grad_norm": 1.348213791847229, + "learning_rate": 3.7574316290130804e-06, + "loss": 0.3627, + "step": 2631 + }, + { + "epoch": 6.252155813261968, + "grad_norm": 1.5443631410598755, + "learning_rate": 3.7550535077288946e-06, + "loss": 0.3227, + "step": 2632 + }, + { + "epoch": 6.254534641688968, + "grad_norm": 1.4760981798171997, + "learning_rate": 3.752675386444709e-06, + "loss": 0.3428, + "step": 2633 + }, + { + "epoch": 6.256913470115968, + "grad_norm": 1.4275660514831543, + "learning_rate": 3.7502972651605234e-06, + "loss": 0.3446, + "step": 2634 + }, + { + "epoch": 6.2592922985429675, + "grad_norm": 1.2849643230438232, + "learning_rate": 3.747919143876338e-06, + "loss": 0.2954, + "step": 2635 + }, + { + "epoch": 6.261671126969968, + "grad_norm": 1.335081696510315, + "learning_rate": 3.745541022592152e-06, + "loss": 0.3039, + "step": 2636 + }, + { + "epoch": 6.264049955396967, + "grad_norm": 1.3974826335906982, + "learning_rate": 3.7431629013079668e-06, + "loss": 0.3161, + "step": 2637 + }, + { + "epoch": 6.266428783823967, + "grad_norm": 1.3891911506652832, + "learning_rate": 3.740784780023782e-06, + "loss": 0.3523, + "step": 2638 + }, + { + "epoch": 6.268807612250966, + "grad_norm": 1.435065507888794, + "learning_rate": 3.738406658739596e-06, + "loss": 0.3654, + "step": 2639 + }, + { + "epoch": 6.271186440677966, + "grad_norm": 1.2375218868255615, + "learning_rate": 3.7360285374554106e-06, + "loss": 0.2888, + "step": 2640 + }, + { + "epoch": 6.2735652691049655, + "grad_norm": 1.3355069160461426, + "learning_rate": 3.7336504161712252e-06, + "loss": 0.3153, + "step": 2641 + }, + { + "epoch": 6.275944097531966, + "grad_norm": 1.5530922412872314, + "learning_rate": 3.7312722948870394e-06, + "loss": 0.4463, + "step": 2642 + }, + { + "epoch": 6.278322925958965, + "grad_norm": 1.516356348991394, + "learning_rate": 3.728894173602854e-06, + "loss": 0.3434, + "step": 2643 + }, + { + "epoch": 6.280701754385965, + "grad_norm": 1.324694037437439, + "learning_rate": 3.7265160523186682e-06, + "loss": 0.3458, + "step": 2644 + }, + { + "epoch": 6.283080582812965, + "grad_norm": 1.344570279121399, + "learning_rate": 3.7241379310344832e-06, + "loss": 0.31, + "step": 2645 + }, + { + "epoch": 6.285459411239964, + "grad_norm": 1.2859435081481934, + "learning_rate": 3.721759809750298e-06, + "loss": 0.2909, + "step": 2646 + }, + { + "epoch": 6.287838239666964, + "grad_norm": 1.4062832593917847, + "learning_rate": 3.719381688466112e-06, + "loss": 0.2994, + "step": 2647 + }, + { + "epoch": 6.2902170680939635, + "grad_norm": 1.4200578927993774, + "learning_rate": 3.7170035671819267e-06, + "loss": 0.3678, + "step": 2648 + }, + { + "epoch": 6.292595896520964, + "grad_norm": 1.404009461402893, + "learning_rate": 3.714625445897741e-06, + "loss": 0.3313, + "step": 2649 + }, + { + "epoch": 6.294974724947963, + "grad_norm": 1.2317243814468384, + "learning_rate": 3.7122473246135555e-06, + "loss": 0.3697, + "step": 2650 + }, + { + "epoch": 6.294974724947963, + "eval_loss": 0.42426997423171997, + "eval_runtime": 23.0757, + "eval_samples_per_second": 32.415, + "eval_steps_per_second": 16.208, + "step": 2650 + }, + { + "epoch": 6.297353553374963, + "grad_norm": 1.2896342277526855, + "learning_rate": 3.7098692033293696e-06, + "loss": 0.3478, + "step": 2651 + }, + { + "epoch": 6.299732381801962, + "grad_norm": 1.4232373237609863, + "learning_rate": 3.7074910820451847e-06, + "loss": 0.4, + "step": 2652 + }, + { + "epoch": 6.302111210228962, + "grad_norm": 1.36734938621521, + "learning_rate": 3.7051129607609993e-06, + "loss": 0.327, + "step": 2653 + }, + { + "epoch": 6.3044900386559615, + "grad_norm": 1.223480224609375, + "learning_rate": 3.7027348394768135e-06, + "loss": 0.2786, + "step": 2654 + }, + { + "epoch": 6.306868867082962, + "grad_norm": 1.2257388830184937, + "learning_rate": 3.700356718192628e-06, + "loss": 0.3199, + "step": 2655 + }, + { + "epoch": 6.309247695509962, + "grad_norm": 1.4102221727371216, + "learning_rate": 3.6979785969084427e-06, + "loss": 0.3299, + "step": 2656 + }, + { + "epoch": 6.311626523936961, + "grad_norm": 1.5999513864517212, + "learning_rate": 3.695600475624257e-06, + "loss": 0.4116, + "step": 2657 + }, + { + "epoch": 6.314005352363961, + "grad_norm": 1.2875629663467407, + "learning_rate": 3.6932223543400715e-06, + "loss": 0.3055, + "step": 2658 + }, + { + "epoch": 6.31638418079096, + "grad_norm": 1.3907963037490845, + "learning_rate": 3.6908442330558865e-06, + "loss": 0.305, + "step": 2659 + }, + { + "epoch": 6.31876300921796, + "grad_norm": 1.3588868379592896, + "learning_rate": 3.6884661117717007e-06, + "loss": 0.3745, + "step": 2660 + }, + { + "epoch": 6.3211418376449595, + "grad_norm": 1.326796293258667, + "learning_rate": 3.6860879904875153e-06, + "loss": 0.3576, + "step": 2661 + }, + { + "epoch": 6.32352066607196, + "grad_norm": 1.3690236806869507, + "learning_rate": 3.6837098692033295e-06, + "loss": 0.3669, + "step": 2662 + }, + { + "epoch": 6.325899494498959, + "grad_norm": 1.2932766675949097, + "learning_rate": 3.681331747919144e-06, + "loss": 0.3034, + "step": 2663 + }, + { + "epoch": 6.328278322925959, + "grad_norm": 1.1622875928878784, + "learning_rate": 3.6789536266349583e-06, + "loss": 0.3154, + "step": 2664 + }, + { + "epoch": 6.330657151352959, + "grad_norm": 1.370376467704773, + "learning_rate": 3.6765755053507733e-06, + "loss": 0.3206, + "step": 2665 + }, + { + "epoch": 6.333035979779958, + "grad_norm": 1.5431971549987793, + "learning_rate": 3.674197384066588e-06, + "loss": 0.4059, + "step": 2666 + }, + { + "epoch": 6.335414808206958, + "grad_norm": 1.4481180906295776, + "learning_rate": 3.671819262782402e-06, + "loss": 0.4121, + "step": 2667 + }, + { + "epoch": 6.337793636633958, + "grad_norm": 1.4497114419937134, + "learning_rate": 3.6694411414982168e-06, + "loss": 0.2852, + "step": 2668 + }, + { + "epoch": 6.340172465060958, + "grad_norm": 1.2185134887695312, + "learning_rate": 3.667063020214031e-06, + "loss": 0.3436, + "step": 2669 + }, + { + "epoch": 6.342551293487957, + "grad_norm": 1.4840409755706787, + "learning_rate": 3.6646848989298456e-06, + "loss": 0.3482, + "step": 2670 + }, + { + "epoch": 6.344930121914957, + "grad_norm": 1.3177366256713867, + "learning_rate": 3.66230677764566e-06, + "loss": 0.3117, + "step": 2671 + }, + { + "epoch": 6.347308950341956, + "grad_norm": 1.397926688194275, + "learning_rate": 3.6599286563614748e-06, + "loss": 0.4038, + "step": 2672 + }, + { + "epoch": 6.349687778768956, + "grad_norm": 1.3738648891448975, + "learning_rate": 3.6575505350772894e-06, + "loss": 0.3842, + "step": 2673 + }, + { + "epoch": 6.352066607195956, + "grad_norm": 1.3642029762268066, + "learning_rate": 3.655172413793104e-06, + "loss": 0.3941, + "step": 2674 + }, + { + "epoch": 6.354445435622956, + "grad_norm": 1.4505587816238403, + "learning_rate": 3.652794292508918e-06, + "loss": 0.3534, + "step": 2675 + }, + { + "epoch": 6.356824264049956, + "grad_norm": 1.4454445838928223, + "learning_rate": 3.650416171224733e-06, + "loss": 0.3439, + "step": 2676 + }, + { + "epoch": 6.359203092476955, + "grad_norm": 1.3723853826522827, + "learning_rate": 3.648038049940547e-06, + "loss": 0.3198, + "step": 2677 + }, + { + "epoch": 6.361581920903955, + "grad_norm": 1.298405408859253, + "learning_rate": 3.6456599286563616e-06, + "loss": 0.3496, + "step": 2678 + }, + { + "epoch": 6.363960749330954, + "grad_norm": 1.3599225282669067, + "learning_rate": 3.6432818073721766e-06, + "loss": 0.3512, + "step": 2679 + }, + { + "epoch": 6.366339577757954, + "grad_norm": 1.2311075925827026, + "learning_rate": 3.640903686087991e-06, + "loss": 0.3761, + "step": 2680 + }, + { + "epoch": 6.3687184061849536, + "grad_norm": 1.429534912109375, + "learning_rate": 3.6385255648038054e-06, + "loss": 0.3552, + "step": 2681 + }, + { + "epoch": 6.371097234611954, + "grad_norm": 1.493389368057251, + "learning_rate": 3.6361474435196196e-06, + "loss": 0.395, + "step": 2682 + }, + { + "epoch": 6.373476063038954, + "grad_norm": 1.6598342657089233, + "learning_rate": 3.6337693222354342e-06, + "loss": 0.4518, + "step": 2683 + }, + { + "epoch": 6.375854891465953, + "grad_norm": 1.3454574346542358, + "learning_rate": 3.631391200951249e-06, + "loss": 0.3666, + "step": 2684 + }, + { + "epoch": 6.378233719892953, + "grad_norm": 1.1488224267959595, + "learning_rate": 3.629013079667063e-06, + "loss": 0.3163, + "step": 2685 + }, + { + "epoch": 6.380612548319952, + "grad_norm": 1.3922067880630493, + "learning_rate": 3.626634958382878e-06, + "loss": 0.327, + "step": 2686 + }, + { + "epoch": 6.382991376746952, + "grad_norm": 1.356584072113037, + "learning_rate": 3.6242568370986927e-06, + "loss": 0.3375, + "step": 2687 + }, + { + "epoch": 6.385370205173952, + "grad_norm": 1.5009675025939941, + "learning_rate": 3.621878715814507e-06, + "loss": 0.3802, + "step": 2688 + }, + { + "epoch": 6.387749033600952, + "grad_norm": 1.3696675300598145, + "learning_rate": 3.6195005945303215e-06, + "loss": 0.2914, + "step": 2689 + }, + { + "epoch": 6.390127862027951, + "grad_norm": 1.6588962078094482, + "learning_rate": 3.6171224732461357e-06, + "loss": 0.3977, + "step": 2690 + }, + { + "epoch": 6.392506690454951, + "grad_norm": 1.4910554885864258, + "learning_rate": 3.6147443519619503e-06, + "loss": 0.3315, + "step": 2691 + }, + { + "epoch": 6.39488551888195, + "grad_norm": 1.4374526739120483, + "learning_rate": 3.6123662306777645e-06, + "loss": 0.3366, + "step": 2692 + }, + { + "epoch": 6.39726434730895, + "grad_norm": 1.304388403892517, + "learning_rate": 3.6099881093935795e-06, + "loss": 0.3128, + "step": 2693 + }, + { + "epoch": 6.39964317573595, + "grad_norm": 1.6092209815979004, + "learning_rate": 3.607609988109394e-06, + "loss": 0.3632, + "step": 2694 + }, + { + "epoch": 6.40202200416295, + "grad_norm": 1.5532820224761963, + "learning_rate": 3.6052318668252083e-06, + "loss": 0.4051, + "step": 2695 + }, + { + "epoch": 6.40440083258995, + "grad_norm": 1.4765859842300415, + "learning_rate": 3.602853745541023e-06, + "loss": 0.3057, + "step": 2696 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 1.4265764951705933, + "learning_rate": 3.600475624256837e-06, + "loss": 0.3309, + "step": 2697 + }, + { + "epoch": 6.409158489443949, + "grad_norm": 1.3168396949768066, + "learning_rate": 3.5980975029726517e-06, + "loss": 0.348, + "step": 2698 + }, + { + "epoch": 6.411537317870948, + "grad_norm": 1.4586856365203857, + "learning_rate": 3.5957193816884667e-06, + "loss": 0.3722, + "step": 2699 + }, + { + "epoch": 6.413916146297948, + "grad_norm": 1.3999022245407104, + "learning_rate": 3.593341260404281e-06, + "loss": 0.3361, + "step": 2700 + }, + { + "epoch": 6.413916146297948, + "eval_loss": 0.42337876558303833, + "eval_runtime": 23.4415, + "eval_samples_per_second": 31.909, + "eval_steps_per_second": 15.955, + "step": 2700 + }, + { + "epoch": 6.416294974724948, + "grad_norm": 1.2211652994155884, + "learning_rate": 3.5909631391200955e-06, + "loss": 0.338, + "step": 2701 + }, + { + "epoch": 6.418673803151948, + "grad_norm": 1.385874629020691, + "learning_rate": 3.58858501783591e-06, + "loss": 0.2887, + "step": 2702 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 1.3146898746490479, + "learning_rate": 3.5862068965517243e-06, + "loss": 0.2858, + "step": 2703 + }, + { + "epoch": 6.423431460005947, + "grad_norm": 1.1576685905456543, + "learning_rate": 3.583828775267539e-06, + "loss": 0.2724, + "step": 2704 + }, + { + "epoch": 6.425810288432947, + "grad_norm": 1.3750783205032349, + "learning_rate": 3.581450653983353e-06, + "loss": 0.2949, + "step": 2705 + }, + { + "epoch": 6.428189116859946, + "grad_norm": 1.319892406463623, + "learning_rate": 3.579072532699168e-06, + "loss": 0.3301, + "step": 2706 + }, + { + "epoch": 6.430567945286946, + "grad_norm": 1.5278986692428589, + "learning_rate": 3.5766944114149828e-06, + "loss": 0.3499, + "step": 2707 + }, + { + "epoch": 6.432946773713946, + "grad_norm": 1.319576382637024, + "learning_rate": 3.574316290130797e-06, + "loss": 0.3395, + "step": 2708 + }, + { + "epoch": 6.435325602140946, + "grad_norm": 1.217728614807129, + "learning_rate": 3.5719381688466116e-06, + "loss": 0.324, + "step": 2709 + }, + { + "epoch": 6.437704430567945, + "grad_norm": 1.3068724870681763, + "learning_rate": 3.5695600475624258e-06, + "loss": 0.3021, + "step": 2710 + }, + { + "epoch": 6.440083258994945, + "grad_norm": 1.326311707496643, + "learning_rate": 3.5671819262782404e-06, + "loss": 0.2922, + "step": 2711 + }, + { + "epoch": 6.442462087421944, + "grad_norm": 1.4083715677261353, + "learning_rate": 3.5648038049940546e-06, + "loss": 0.3947, + "step": 2712 + }, + { + "epoch": 6.444840915848944, + "grad_norm": 1.4286690950393677, + "learning_rate": 3.5624256837098696e-06, + "loss": 0.2971, + "step": 2713 + }, + { + "epoch": 6.4472197442759445, + "grad_norm": 1.396643042564392, + "learning_rate": 3.560047562425684e-06, + "loss": 0.2743, + "step": 2714 + }, + { + "epoch": 6.449598572702944, + "grad_norm": 1.3096516132354736, + "learning_rate": 3.5576694411414984e-06, + "loss": 0.3253, + "step": 2715 + }, + { + "epoch": 6.451977401129944, + "grad_norm": 1.544026255607605, + "learning_rate": 3.555291319857313e-06, + "loss": 0.3503, + "step": 2716 + }, + { + "epoch": 6.454356229556943, + "grad_norm": 1.5038871765136719, + "learning_rate": 3.5529131985731276e-06, + "loss": 0.3583, + "step": 2717 + }, + { + "epoch": 6.456735057983943, + "grad_norm": 1.3547450304031372, + "learning_rate": 3.550535077288942e-06, + "loss": 0.3228, + "step": 2718 + }, + { + "epoch": 6.459113886410942, + "grad_norm": 1.3871971368789673, + "learning_rate": 3.5481569560047564e-06, + "loss": 0.3372, + "step": 2719 + }, + { + "epoch": 6.461492714837942, + "grad_norm": 1.414191484451294, + "learning_rate": 3.5457788347205714e-06, + "loss": 0.3698, + "step": 2720 + }, + { + "epoch": 6.463871543264942, + "grad_norm": 1.565316081047058, + "learning_rate": 3.5434007134363856e-06, + "loss": 0.2928, + "step": 2721 + }, + { + "epoch": 6.466250371691942, + "grad_norm": 1.6727120876312256, + "learning_rate": 3.5410225921522002e-06, + "loss": 0.3897, + "step": 2722 + }, + { + "epoch": 6.468629200118942, + "grad_norm": 1.495957612991333, + "learning_rate": 3.5386444708680144e-06, + "loss": 0.3699, + "step": 2723 + }, + { + "epoch": 6.471008028545941, + "grad_norm": 1.687485694885254, + "learning_rate": 3.536266349583829e-06, + "loss": 0.4111, + "step": 2724 + }, + { + "epoch": 6.473386856972941, + "grad_norm": 1.119472861289978, + "learning_rate": 3.5338882282996432e-06, + "loss": 0.283, + "step": 2725 + }, + { + "epoch": 6.47576568539994, + "grad_norm": 1.485710859298706, + "learning_rate": 3.531510107015458e-06, + "loss": 0.426, + "step": 2726 + }, + { + "epoch": 6.4781445138269405, + "grad_norm": 1.458084225654602, + "learning_rate": 3.529131985731273e-06, + "loss": 0.3695, + "step": 2727 + }, + { + "epoch": 6.48052334225394, + "grad_norm": 1.264891505241394, + "learning_rate": 3.526753864447087e-06, + "loss": 0.3328, + "step": 2728 + }, + { + "epoch": 6.48290217068094, + "grad_norm": 1.3362623453140259, + "learning_rate": 3.5243757431629017e-06, + "loss": 0.3306, + "step": 2729 + }, + { + "epoch": 6.485280999107939, + "grad_norm": 1.4146660566329956, + "learning_rate": 3.5219976218787163e-06, + "loss": 0.3329, + "step": 2730 + }, + { + "epoch": 6.487659827534939, + "grad_norm": 1.6272141933441162, + "learning_rate": 3.5196195005945305e-06, + "loss": 0.3776, + "step": 2731 + }, + { + "epoch": 6.490038655961939, + "grad_norm": 1.3792216777801514, + "learning_rate": 3.517241379310345e-06, + "loss": 0.2761, + "step": 2732 + }, + { + "epoch": 6.492417484388938, + "grad_norm": 1.3296266794204712, + "learning_rate": 3.5148632580261593e-06, + "loss": 0.4062, + "step": 2733 + }, + { + "epoch": 6.4947963128159385, + "grad_norm": 1.2744592428207397, + "learning_rate": 3.5124851367419743e-06, + "loss": 0.374, + "step": 2734 + }, + { + "epoch": 6.497175141242938, + "grad_norm": 1.3845752477645874, + "learning_rate": 3.510107015457789e-06, + "loss": 0.3887, + "step": 2735 + }, + { + "epoch": 6.499553969669938, + "grad_norm": 1.2604643106460571, + "learning_rate": 3.507728894173603e-06, + "loss": 0.3176, + "step": 2736 + }, + { + "epoch": 6.501932798096937, + "grad_norm": 1.4110983610153198, + "learning_rate": 3.5053507728894177e-06, + "loss": 0.3831, + "step": 2737 + }, + { + "epoch": 6.504311626523937, + "grad_norm": 1.3532415628433228, + "learning_rate": 3.502972651605232e-06, + "loss": 0.3355, + "step": 2738 + }, + { + "epoch": 6.506690454950936, + "grad_norm": 1.6337836980819702, + "learning_rate": 3.5005945303210465e-06, + "loss": 0.3565, + "step": 2739 + }, + { + "epoch": 6.509069283377936, + "grad_norm": 1.3413400650024414, + "learning_rate": 3.4982164090368615e-06, + "loss": 0.3474, + "step": 2740 + }, + { + "epoch": 6.5114481118049365, + "grad_norm": 1.335922360420227, + "learning_rate": 3.4958382877526757e-06, + "loss": 0.2896, + "step": 2741 + }, + { + "epoch": 6.513826940231936, + "grad_norm": 1.402234673500061, + "learning_rate": 3.4934601664684903e-06, + "loss": 0.3611, + "step": 2742 + }, + { + "epoch": 6.516205768658936, + "grad_norm": 1.334345817565918, + "learning_rate": 3.4910820451843045e-06, + "loss": 0.3327, + "step": 2743 + }, + { + "epoch": 6.518584597085935, + "grad_norm": 1.431458592414856, + "learning_rate": 3.488703923900119e-06, + "loss": 0.3792, + "step": 2744 + }, + { + "epoch": 6.520963425512935, + "grad_norm": 1.4619853496551514, + "learning_rate": 3.4863258026159337e-06, + "loss": 0.3669, + "step": 2745 + }, + { + "epoch": 6.523342253939934, + "grad_norm": 1.3710017204284668, + "learning_rate": 3.483947681331748e-06, + "loss": 0.2804, + "step": 2746 + }, + { + "epoch": 6.5257210823669345, + "grad_norm": 1.43901526927948, + "learning_rate": 3.481569560047563e-06, + "loss": 0.337, + "step": 2747 + }, + { + "epoch": 6.528099910793934, + "grad_norm": 1.3881027698516846, + "learning_rate": 3.4791914387633776e-06, + "loss": 0.3244, + "step": 2748 + }, + { + "epoch": 6.530478739220934, + "grad_norm": 1.2311205863952637, + "learning_rate": 3.4768133174791918e-06, + "loss": 0.2976, + "step": 2749 + }, + { + "epoch": 6.532857567647933, + "grad_norm": 1.5653150081634521, + "learning_rate": 3.4744351961950064e-06, + "loss": 0.341, + "step": 2750 + }, + { + "epoch": 6.532857567647933, + "eval_loss": 0.4250675439834595, + "eval_runtime": 23.0403, + "eval_samples_per_second": 32.465, + "eval_steps_per_second": 16.232, + "step": 2750 + }, + { + "epoch": 6.535236396074933, + "grad_norm": 1.4294503927230835, + "learning_rate": 3.4720570749108206e-06, + "loss": 0.3285, + "step": 2751 + }, + { + "epoch": 6.537615224501932, + "grad_norm": 1.5753880739212036, + "learning_rate": 3.469678953626635e-06, + "loss": 0.4126, + "step": 2752 + }, + { + "epoch": 6.539994052928932, + "grad_norm": 1.4154236316680908, + "learning_rate": 3.4673008323424494e-06, + "loss": 0.3764, + "step": 2753 + }, + { + "epoch": 6.5423728813559325, + "grad_norm": 1.217604637145996, + "learning_rate": 3.4649227110582644e-06, + "loss": 0.3521, + "step": 2754 + }, + { + "epoch": 6.544751709782932, + "grad_norm": 1.2494088411331177, + "learning_rate": 3.462544589774079e-06, + "loss": 0.3319, + "step": 2755 + }, + { + "epoch": 6.547130538209932, + "grad_norm": 1.2701776027679443, + "learning_rate": 3.460166468489893e-06, + "loss": 0.3674, + "step": 2756 + }, + { + "epoch": 6.549509366636931, + "grad_norm": 1.337778091430664, + "learning_rate": 3.457788347205708e-06, + "loss": 0.3392, + "step": 2757 + }, + { + "epoch": 6.551888195063931, + "grad_norm": 1.6449202299118042, + "learning_rate": 3.455410225921522e-06, + "loss": 0.3608, + "step": 2758 + }, + { + "epoch": 6.55426702349093, + "grad_norm": 1.3283525705337524, + "learning_rate": 3.4530321046373366e-06, + "loss": 0.4074, + "step": 2759 + }, + { + "epoch": 6.5566458519179305, + "grad_norm": 1.3753374814987183, + "learning_rate": 3.4506539833531512e-06, + "loss": 0.3899, + "step": 2760 + }, + { + "epoch": 6.55902468034493, + "grad_norm": 1.5532865524291992, + "learning_rate": 3.448275862068966e-06, + "loss": 0.3304, + "step": 2761 + }, + { + "epoch": 6.56140350877193, + "grad_norm": 1.3218581676483154, + "learning_rate": 3.4458977407847804e-06, + "loss": 0.3139, + "step": 2762 + }, + { + "epoch": 6.56378233719893, + "grad_norm": 1.3597853183746338, + "learning_rate": 3.443519619500595e-06, + "loss": 0.3221, + "step": 2763 + }, + { + "epoch": 6.566161165625929, + "grad_norm": 1.4700864553451538, + "learning_rate": 3.4411414982164092e-06, + "loss": 0.372, + "step": 2764 + }, + { + "epoch": 6.568539994052929, + "grad_norm": 1.424184799194336, + "learning_rate": 3.438763376932224e-06, + "loss": 0.3864, + "step": 2765 + }, + { + "epoch": 6.570918822479928, + "grad_norm": 1.652013897895813, + "learning_rate": 3.436385255648038e-06, + "loss": 0.3999, + "step": 2766 + }, + { + "epoch": 6.5732976509069285, + "grad_norm": 1.5024573802947998, + "learning_rate": 3.4340071343638526e-06, + "loss": 0.3424, + "step": 2767 + }, + { + "epoch": 6.575676479333928, + "grad_norm": 1.5580182075500488, + "learning_rate": 3.4316290130796677e-06, + "loss": 0.3793, + "step": 2768 + }, + { + "epoch": 6.578055307760928, + "grad_norm": 1.4185994863510132, + "learning_rate": 3.429250891795482e-06, + "loss": 0.3668, + "step": 2769 + }, + { + "epoch": 6.580434136187927, + "grad_norm": 1.6796863079071045, + "learning_rate": 3.4268727705112965e-06, + "loss": 0.4009, + "step": 2770 + }, + { + "epoch": 6.582812964614927, + "grad_norm": 1.472289800643921, + "learning_rate": 3.4244946492271107e-06, + "loss": 0.407, + "step": 2771 + }, + { + "epoch": 6.585191793041927, + "grad_norm": 1.490464448928833, + "learning_rate": 3.4221165279429253e-06, + "loss": 0.3678, + "step": 2772 + }, + { + "epoch": 6.5875706214689265, + "grad_norm": 1.528519868850708, + "learning_rate": 3.41973840665874e-06, + "loss": 0.4044, + "step": 2773 + }, + { + "epoch": 6.589949449895927, + "grad_norm": 1.3567736148834229, + "learning_rate": 3.417360285374554e-06, + "loss": 0.3677, + "step": 2774 + }, + { + "epoch": 6.592328278322926, + "grad_norm": 1.2657711505889893, + "learning_rate": 3.414982164090369e-06, + "loss": 0.3716, + "step": 2775 + }, + { + "epoch": 6.594707106749926, + "grad_norm": 1.407591700553894, + "learning_rate": 3.4126040428061837e-06, + "loss": 0.308, + "step": 2776 + }, + { + "epoch": 6.597085935176925, + "grad_norm": 1.3087775707244873, + "learning_rate": 3.410225921521998e-06, + "loss": 0.3957, + "step": 2777 + }, + { + "epoch": 6.599464763603925, + "grad_norm": 1.3723517656326294, + "learning_rate": 3.4078478002378125e-06, + "loss": 0.3463, + "step": 2778 + }, + { + "epoch": 6.601843592030924, + "grad_norm": 1.3305402994155884, + "learning_rate": 3.4054696789536267e-06, + "loss": 0.2908, + "step": 2779 + }, + { + "epoch": 6.6042224204579245, + "grad_norm": 1.3189060688018799, + "learning_rate": 3.4030915576694413e-06, + "loss": 0.3335, + "step": 2780 + }, + { + "epoch": 6.606601248884925, + "grad_norm": 1.5175609588623047, + "learning_rate": 3.4007134363852564e-06, + "loss": 0.3319, + "step": 2781 + }, + { + "epoch": 6.608980077311924, + "grad_norm": 1.448644995689392, + "learning_rate": 3.3983353151010705e-06, + "loss": 0.3431, + "step": 2782 + }, + { + "epoch": 6.611358905738924, + "grad_norm": 1.4072167873382568, + "learning_rate": 3.395957193816885e-06, + "loss": 0.3149, + "step": 2783 + }, + { + "epoch": 6.613737734165923, + "grad_norm": 1.4189567565917969, + "learning_rate": 3.3935790725326993e-06, + "loss": 0.3719, + "step": 2784 + }, + { + "epoch": 6.616116562592923, + "grad_norm": 1.4861245155334473, + "learning_rate": 3.391200951248514e-06, + "loss": 0.3291, + "step": 2785 + }, + { + "epoch": 6.6184953910199225, + "grad_norm": 1.5126738548278809, + "learning_rate": 3.388822829964328e-06, + "loss": 0.3651, + "step": 2786 + }, + { + "epoch": 6.620874219446923, + "grad_norm": 1.3647444248199463, + "learning_rate": 3.3864447086801427e-06, + "loss": 0.3766, + "step": 2787 + }, + { + "epoch": 6.623253047873922, + "grad_norm": 1.402726173400879, + "learning_rate": 3.3840665873959578e-06, + "loss": 0.3313, + "step": 2788 + }, + { + "epoch": 6.625631876300922, + "grad_norm": 1.5462816953659058, + "learning_rate": 3.381688466111772e-06, + "loss": 0.3532, + "step": 2789 + }, + { + "epoch": 6.628010704727922, + "grad_norm": 1.3263099193572998, + "learning_rate": 3.3793103448275866e-06, + "loss": 0.3413, + "step": 2790 + }, + { + "epoch": 6.630389533154921, + "grad_norm": 1.5322917699813843, + "learning_rate": 3.376932223543401e-06, + "loss": 0.3952, + "step": 2791 + }, + { + "epoch": 6.632768361581921, + "grad_norm": 1.417512059211731, + "learning_rate": 3.3745541022592154e-06, + "loss": 0.33, + "step": 2792 + }, + { + "epoch": 6.6351471900089205, + "grad_norm": 1.4467958211898804, + "learning_rate": 3.37217598097503e-06, + "loss": 0.3387, + "step": 2793 + }, + { + "epoch": 6.637526018435921, + "grad_norm": 1.2188092470169067, + "learning_rate": 3.369797859690844e-06, + "loss": 0.3403, + "step": 2794 + }, + { + "epoch": 6.63990484686292, + "grad_norm": 1.4625403881072998, + "learning_rate": 3.367419738406659e-06, + "loss": 0.331, + "step": 2795 + }, + { + "epoch": 6.64228367528992, + "grad_norm": 1.3032160997390747, + "learning_rate": 3.365041617122474e-06, + "loss": 0.3157, + "step": 2796 + }, + { + "epoch": 6.644662503716919, + "grad_norm": 1.2930092811584473, + "learning_rate": 3.362663495838288e-06, + "loss": 0.3113, + "step": 2797 + }, + { + "epoch": 6.647041332143919, + "grad_norm": 1.5683459043502808, + "learning_rate": 3.3602853745541026e-06, + "loss": 0.4255, + "step": 2798 + }, + { + "epoch": 6.6494201605709184, + "grad_norm": 1.2718632221221924, + "learning_rate": 3.357907253269917e-06, + "loss": 0.3106, + "step": 2799 + }, + { + "epoch": 6.6517989889979185, + "grad_norm": 1.2917662858963013, + "learning_rate": 3.3555291319857314e-06, + "loss": 0.3828, + "step": 2800 + }, + { + "epoch": 6.6517989889979185, + "eval_loss": 0.42329561710357666, + "eval_runtime": 23.0643, + "eval_samples_per_second": 32.431, + "eval_steps_per_second": 16.216, + "step": 2800 + }, + { + "epoch": 6.654177817424918, + "grad_norm": 1.5991885662078857, + "learning_rate": 3.3531510107015456e-06, + "loss": 0.4017, + "step": 2801 + }, + { + "epoch": 6.656556645851918, + "grad_norm": 1.510757565498352, + "learning_rate": 3.3507728894173606e-06, + "loss": 0.3122, + "step": 2802 + }, + { + "epoch": 6.658935474278918, + "grad_norm": 1.469111680984497, + "learning_rate": 3.3483947681331753e-06, + "loss": 0.4397, + "step": 2803 + }, + { + "epoch": 6.661314302705917, + "grad_norm": 1.3826594352722168, + "learning_rate": 3.3460166468489894e-06, + "loss": 0.2846, + "step": 2804 + }, + { + "epoch": 6.663693131132917, + "grad_norm": 1.1618965864181519, + "learning_rate": 3.343638525564804e-06, + "loss": 0.2974, + "step": 2805 + }, + { + "epoch": 6.6660719595599165, + "grad_norm": 1.4580121040344238, + "learning_rate": 3.3412604042806187e-06, + "loss": 0.3332, + "step": 2806 + }, + { + "epoch": 6.668450787986917, + "grad_norm": 1.4929895401000977, + "learning_rate": 3.338882282996433e-06, + "loss": 0.3798, + "step": 2807 + }, + { + "epoch": 6.670829616413916, + "grad_norm": 1.2745121717453003, + "learning_rate": 3.3365041617122475e-06, + "loss": 0.3519, + "step": 2808 + }, + { + "epoch": 6.673208444840916, + "grad_norm": 1.586761713027954, + "learning_rate": 3.3341260404280625e-06, + "loss": 0.3393, + "step": 2809 + }, + { + "epoch": 6.675587273267915, + "grad_norm": 1.4201958179473877, + "learning_rate": 3.3317479191438767e-06, + "loss": 0.2815, + "step": 2810 + }, + { + "epoch": 6.677966101694915, + "grad_norm": 1.4583454132080078, + "learning_rate": 3.3293697978596913e-06, + "loss": 0.3435, + "step": 2811 + }, + { + "epoch": 6.680344930121915, + "grad_norm": 1.400944471359253, + "learning_rate": 3.3269916765755055e-06, + "loss": 0.3385, + "step": 2812 + }, + { + "epoch": 6.6827237585489145, + "grad_norm": 1.2988190650939941, + "learning_rate": 3.32461355529132e-06, + "loss": 0.3025, + "step": 2813 + }, + { + "epoch": 6.685102586975915, + "grad_norm": 1.1990025043487549, + "learning_rate": 3.3222354340071343e-06, + "loss": 0.2824, + "step": 2814 + }, + { + "epoch": 6.687481415402914, + "grad_norm": 1.328501582145691, + "learning_rate": 3.319857312722949e-06, + "loss": 0.2957, + "step": 2815 + }, + { + "epoch": 6.689860243829914, + "grad_norm": 1.4951635599136353, + "learning_rate": 3.317479191438764e-06, + "loss": 0.3649, + "step": 2816 + }, + { + "epoch": 6.692239072256913, + "grad_norm": 1.3397221565246582, + "learning_rate": 3.315101070154578e-06, + "loss": 0.3765, + "step": 2817 + }, + { + "epoch": 6.694617900683913, + "grad_norm": 1.3665337562561035, + "learning_rate": 3.3127229488703927e-06, + "loss": 0.2885, + "step": 2818 + }, + { + "epoch": 6.6969967291109125, + "grad_norm": 1.4334619045257568, + "learning_rate": 3.3103448275862073e-06, + "loss": 0.3286, + "step": 2819 + }, + { + "epoch": 6.699375557537913, + "grad_norm": 1.490972876548767, + "learning_rate": 3.3079667063020215e-06, + "loss": 0.3467, + "step": 2820 + }, + { + "epoch": 6.701754385964913, + "grad_norm": 1.4246447086334229, + "learning_rate": 3.305588585017836e-06, + "loss": 0.3244, + "step": 2821 + }, + { + "epoch": 6.704133214391912, + "grad_norm": 1.3430454730987549, + "learning_rate": 3.3032104637336507e-06, + "loss": 0.3786, + "step": 2822 + }, + { + "epoch": 6.706512042818912, + "grad_norm": 1.5046801567077637, + "learning_rate": 3.3008323424494654e-06, + "loss": 0.3905, + "step": 2823 + }, + { + "epoch": 6.708890871245911, + "grad_norm": 1.2306205034255981, + "learning_rate": 3.29845422116528e-06, + "loss": 0.3101, + "step": 2824 + }, + { + "epoch": 6.711269699672911, + "grad_norm": 1.4948253631591797, + "learning_rate": 3.296076099881094e-06, + "loss": 0.3395, + "step": 2825 + }, + { + "epoch": 6.7136485280999105, + "grad_norm": 1.3491920232772827, + "learning_rate": 3.2936979785969088e-06, + "loss": 0.3107, + "step": 2826 + }, + { + "epoch": 6.716027356526911, + "grad_norm": 1.4511719942092896, + "learning_rate": 3.291319857312723e-06, + "loss": 0.3426, + "step": 2827 + }, + { + "epoch": 6.71840618495391, + "grad_norm": 1.3420836925506592, + "learning_rate": 3.2889417360285376e-06, + "loss": 0.3256, + "step": 2828 + }, + { + "epoch": 6.72078501338091, + "grad_norm": 1.3855271339416504, + "learning_rate": 3.2865636147443526e-06, + "loss": 0.3735, + "step": 2829 + }, + { + "epoch": 6.72316384180791, + "grad_norm": 1.4537755250930786, + "learning_rate": 3.2841854934601668e-06, + "loss": 0.3266, + "step": 2830 + }, + { + "epoch": 6.725542670234909, + "grad_norm": 1.5638060569763184, + "learning_rate": 3.2818073721759814e-06, + "loss": 0.3239, + "step": 2831 + }, + { + "epoch": 6.727921498661909, + "grad_norm": 1.4797362089157104, + "learning_rate": 3.2794292508917956e-06, + "loss": 0.3582, + "step": 2832 + }, + { + "epoch": 6.730300327088909, + "grad_norm": 1.8546558618545532, + "learning_rate": 3.27705112960761e-06, + "loss": 0.4139, + "step": 2833 + }, + { + "epoch": 6.732679155515909, + "grad_norm": 1.4836021661758423, + "learning_rate": 3.274673008323425e-06, + "loss": 0.3835, + "step": 2834 + }, + { + "epoch": 6.735057983942908, + "grad_norm": 1.5389071702957153, + "learning_rate": 3.272294887039239e-06, + "loss": 0.3396, + "step": 2835 + }, + { + "epoch": 6.737436812369908, + "grad_norm": 1.3102928400039673, + "learning_rate": 3.269916765755054e-06, + "loss": 0.314, + "step": 2836 + }, + { + "epoch": 6.739815640796907, + "grad_norm": 1.3666446208953857, + "learning_rate": 3.2675386444708686e-06, + "loss": 0.3828, + "step": 2837 + }, + { + "epoch": 6.742194469223907, + "grad_norm": 1.3665192127227783, + "learning_rate": 3.265160523186683e-06, + "loss": 0.3387, + "step": 2838 + }, + { + "epoch": 6.744573297650907, + "grad_norm": 1.454422950744629, + "learning_rate": 3.2627824019024974e-06, + "loss": 0.3656, + "step": 2839 + }, + { + "epoch": 6.746952126077907, + "grad_norm": 1.3154356479644775, + "learning_rate": 3.2604042806183116e-06, + "loss": 0.345, + "step": 2840 + }, + { + "epoch": 6.749330954504907, + "grad_norm": 1.37127685546875, + "learning_rate": 3.2580261593341262e-06, + "loss": 0.3683, + "step": 2841 + }, + { + "epoch": 6.751709782931906, + "grad_norm": 1.2092419862747192, + "learning_rate": 3.2556480380499404e-06, + "loss": 0.3082, + "step": 2842 + }, + { + "epoch": 6.754088611358906, + "grad_norm": 1.3536608219146729, + "learning_rate": 3.2532699167657555e-06, + "loss": 0.2831, + "step": 2843 + }, + { + "epoch": 6.756467439785905, + "grad_norm": 1.4086867570877075, + "learning_rate": 3.25089179548157e-06, + "loss": 0.3854, + "step": 2844 + }, + { + "epoch": 6.758846268212905, + "grad_norm": 1.3574670553207397, + "learning_rate": 3.2485136741973842e-06, + "loss": 0.2997, + "step": 2845 + }, + { + "epoch": 6.761225096639905, + "grad_norm": 1.2371854782104492, + "learning_rate": 3.246135552913199e-06, + "loss": 0.3211, + "step": 2846 + }, + { + "epoch": 6.763603925066905, + "grad_norm": 1.4218463897705078, + "learning_rate": 3.243757431629013e-06, + "loss": 0.3877, + "step": 2847 + }, + { + "epoch": 6.765982753493904, + "grad_norm": 1.2251644134521484, + "learning_rate": 3.2413793103448277e-06, + "loss": 0.3159, + "step": 2848 + }, + { + "epoch": 6.768361581920904, + "grad_norm": 1.3835780620574951, + "learning_rate": 3.2390011890606423e-06, + "loss": 0.2893, + "step": 2849 + }, + { + "epoch": 6.770740410347903, + "grad_norm": 1.5152909755706787, + "learning_rate": 3.236623067776457e-06, + "loss": 0.4466, + "step": 2850 + }, + { + "epoch": 6.770740410347903, + "eval_loss": 0.4229939877986908, + "eval_runtime": 23.0709, + "eval_samples_per_second": 32.422, + "eval_steps_per_second": 16.211, + "step": 2850 + }, + { + "epoch": 6.773119238774903, + "grad_norm": 1.3829107284545898, + "learning_rate": 3.2342449464922715e-06, + "loss": 0.4201, + "step": 2851 + }, + { + "epoch": 6.775498067201903, + "grad_norm": 1.2810947895050049, + "learning_rate": 3.231866825208086e-06, + "loss": 0.3315, + "step": 2852 + }, + { + "epoch": 6.777876895628903, + "grad_norm": 1.4317586421966553, + "learning_rate": 3.2294887039239003e-06, + "loss": 0.2884, + "step": 2853 + }, + { + "epoch": 6.780255724055903, + "grad_norm": 1.3490461111068726, + "learning_rate": 3.227110582639715e-06, + "loss": 0.3504, + "step": 2854 + }, + { + "epoch": 6.782634552482902, + "grad_norm": 1.2938369512557983, + "learning_rate": 3.224732461355529e-06, + "loss": 0.299, + "step": 2855 + }, + { + "epoch": 6.785013380909902, + "grad_norm": 1.1591302156448364, + "learning_rate": 3.222354340071344e-06, + "loss": 0.295, + "step": 2856 + }, + { + "epoch": 6.787392209336901, + "grad_norm": 1.4754201173782349, + "learning_rate": 3.2199762187871587e-06, + "loss": 0.3437, + "step": 2857 + }, + { + "epoch": 6.789771037763901, + "grad_norm": 1.419091820716858, + "learning_rate": 3.217598097502973e-06, + "loss": 0.3786, + "step": 2858 + }, + { + "epoch": 6.7921498661909006, + "grad_norm": 1.4479961395263672, + "learning_rate": 3.2152199762187875e-06, + "loss": 0.3659, + "step": 2859 + }, + { + "epoch": 6.794528694617901, + "grad_norm": 1.3166942596435547, + "learning_rate": 3.2128418549346017e-06, + "loss": 0.3583, + "step": 2860 + }, + { + "epoch": 6.796907523044901, + "grad_norm": 1.3683496713638306, + "learning_rate": 3.2104637336504163e-06, + "loss": 0.3442, + "step": 2861 + }, + { + "epoch": 6.7992863514719, + "grad_norm": 1.2312074899673462, + "learning_rate": 3.2080856123662305e-06, + "loss": 0.3175, + "step": 2862 + }, + { + "epoch": 6.8016651798989, + "grad_norm": 1.4744106531143188, + "learning_rate": 3.2057074910820456e-06, + "loss": 0.3085, + "step": 2863 + }, + { + "epoch": 6.804044008325899, + "grad_norm": 1.5414005517959595, + "learning_rate": 3.20332936979786e-06, + "loss": 0.3493, + "step": 2864 + }, + { + "epoch": 6.806422836752899, + "grad_norm": 1.4385278224945068, + "learning_rate": 3.2009512485136743e-06, + "loss": 0.3333, + "step": 2865 + }, + { + "epoch": 6.808801665179899, + "grad_norm": 1.364806890487671, + "learning_rate": 3.198573127229489e-06, + "loss": 0.3126, + "step": 2866 + }, + { + "epoch": 6.811180493606899, + "grad_norm": 1.360108494758606, + "learning_rate": 3.1961950059453036e-06, + "loss": 0.4024, + "step": 2867 + }, + { + "epoch": 6.813559322033898, + "grad_norm": 1.4987859725952148, + "learning_rate": 3.1938168846611178e-06, + "loss": 0.3306, + "step": 2868 + }, + { + "epoch": 6.815938150460898, + "grad_norm": 1.370896816253662, + "learning_rate": 3.1914387633769324e-06, + "loss": 0.3036, + "step": 2869 + }, + { + "epoch": 6.818316978887898, + "grad_norm": 1.4347071647644043, + "learning_rate": 3.1890606420927474e-06, + "loss": 0.3253, + "step": 2870 + }, + { + "epoch": 6.820695807314897, + "grad_norm": 1.4869388341903687, + "learning_rate": 3.1866825208085616e-06, + "loss": 0.421, + "step": 2871 + }, + { + "epoch": 6.823074635741897, + "grad_norm": 1.5328190326690674, + "learning_rate": 3.184304399524376e-06, + "loss": 0.3551, + "step": 2872 + }, + { + "epoch": 6.825453464168897, + "grad_norm": 1.5913931131362915, + "learning_rate": 3.1819262782401904e-06, + "loss": 0.3936, + "step": 2873 + }, + { + "epoch": 6.827832292595897, + "grad_norm": 1.3014717102050781, + "learning_rate": 3.179548156956005e-06, + "loss": 0.2808, + "step": 2874 + }, + { + "epoch": 6.830211121022896, + "grad_norm": 1.2350847721099854, + "learning_rate": 3.177170035671819e-06, + "loss": 0.3616, + "step": 2875 + }, + { + "epoch": 6.832589949449896, + "grad_norm": 1.6136014461517334, + "learning_rate": 3.174791914387634e-06, + "loss": 0.3773, + "step": 2876 + }, + { + "epoch": 6.834968777876895, + "grad_norm": 1.563650131225586, + "learning_rate": 3.172413793103449e-06, + "loss": 0.4348, + "step": 2877 + }, + { + "epoch": 6.837347606303895, + "grad_norm": 1.4426994323730469, + "learning_rate": 3.170035671819263e-06, + "loss": 0.347, + "step": 2878 + }, + { + "epoch": 6.8397264347308955, + "grad_norm": 1.1924318075180054, + "learning_rate": 3.1676575505350776e-06, + "loss": 0.3306, + "step": 2879 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 1.6423641443252563, + "learning_rate": 3.1652794292508922e-06, + "loss": 0.4145, + "step": 2880 + }, + { + "epoch": 6.844484091584895, + "grad_norm": 1.493494987487793, + "learning_rate": 3.1629013079667064e-06, + "loss": 0.3295, + "step": 2881 + }, + { + "epoch": 6.846862920011894, + "grad_norm": 1.4102424383163452, + "learning_rate": 3.160523186682521e-06, + "loss": 0.3009, + "step": 2882 + }, + { + "epoch": 6.849241748438894, + "grad_norm": 1.456471562385559, + "learning_rate": 3.1581450653983352e-06, + "loss": 0.3717, + "step": 2883 + }, + { + "epoch": 6.851620576865893, + "grad_norm": 1.482839584350586, + "learning_rate": 3.1557669441141503e-06, + "loss": 0.3387, + "step": 2884 + }, + { + "epoch": 6.853999405292893, + "grad_norm": 1.4746038913726807, + "learning_rate": 3.153388822829965e-06, + "loss": 0.3167, + "step": 2885 + }, + { + "epoch": 6.856378233719893, + "grad_norm": 1.3691916465759277, + "learning_rate": 3.151010701545779e-06, + "loss": 0.2866, + "step": 2886 + }, + { + "epoch": 6.858757062146893, + "grad_norm": 1.6208133697509766, + "learning_rate": 3.1486325802615937e-06, + "loss": 0.3157, + "step": 2887 + }, + { + "epoch": 6.861135890573893, + "grad_norm": 1.4449056386947632, + "learning_rate": 3.146254458977408e-06, + "loss": 0.3043, + "step": 2888 + }, + { + "epoch": 6.863514719000892, + "grad_norm": 1.4025135040283203, + "learning_rate": 3.1438763376932225e-06, + "loss": 0.3846, + "step": 2889 + }, + { + "epoch": 6.865893547427892, + "grad_norm": 1.4334772825241089, + "learning_rate": 3.1414982164090367e-06, + "loss": 0.3961, + "step": 2890 + }, + { + "epoch": 6.868272375854891, + "grad_norm": 1.4286003112792969, + "learning_rate": 3.1391200951248517e-06, + "loss": 0.3087, + "step": 2891 + }, + { + "epoch": 6.8706512042818915, + "grad_norm": 1.3459912538528442, + "learning_rate": 3.1367419738406663e-06, + "loss": 0.3145, + "step": 2892 + }, + { + "epoch": 6.873030032708891, + "grad_norm": 1.3344213962554932, + "learning_rate": 3.1343638525564805e-06, + "loss": 0.2958, + "step": 2893 + }, + { + "epoch": 6.875408861135891, + "grad_norm": 1.4911201000213623, + "learning_rate": 3.131985731272295e-06, + "loss": 0.3593, + "step": 2894 + }, + { + "epoch": 6.87778768956289, + "grad_norm": 1.3623155355453491, + "learning_rate": 3.1296076099881097e-06, + "loss": 0.3677, + "step": 2895 + }, + { + "epoch": 6.88016651798989, + "grad_norm": 1.5945944786071777, + "learning_rate": 3.127229488703924e-06, + "loss": 0.3921, + "step": 2896 + }, + { + "epoch": 6.882545346416889, + "grad_norm": 1.2297807931900024, + "learning_rate": 3.124851367419739e-06, + "loss": 0.351, + "step": 2897 + }, + { + "epoch": 6.884924174843889, + "grad_norm": 1.4650846719741821, + "learning_rate": 3.1224732461355535e-06, + "loss": 0.3069, + "step": 2898 + }, + { + "epoch": 6.887303003270889, + "grad_norm": 1.760699987411499, + "learning_rate": 3.1200951248513677e-06, + "loss": 0.3073, + "step": 2899 + }, + { + "epoch": 6.889681831697889, + "grad_norm": 1.454857587814331, + "learning_rate": 3.1177170035671823e-06, + "loss": 0.3549, + "step": 2900 + }, + { + "epoch": 6.889681831697889, + "eval_loss": 0.4238797426223755, + "eval_runtime": 23.0704, + "eval_samples_per_second": 32.422, + "eval_steps_per_second": 16.211, + "step": 2900 + }, + { + "epoch": 6.892060660124889, + "grad_norm": 1.5653049945831299, + "learning_rate": 3.1153388822829965e-06, + "loss": 0.3827, + "step": 2901 + }, + { + "epoch": 6.894439488551888, + "grad_norm": 1.6649552583694458, + "learning_rate": 3.112960760998811e-06, + "loss": 0.4084, + "step": 2902 + }, + { + "epoch": 6.896818316978888, + "grad_norm": 1.5699725151062012, + "learning_rate": 3.1105826397146253e-06, + "loss": 0.3769, + "step": 2903 + }, + { + "epoch": 6.899197145405887, + "grad_norm": 1.442388653755188, + "learning_rate": 3.1082045184304404e-06, + "loss": 0.328, + "step": 2904 + }, + { + "epoch": 6.9015759738328875, + "grad_norm": 1.3194785118103027, + "learning_rate": 3.105826397146255e-06, + "loss": 0.264, + "step": 2905 + }, + { + "epoch": 6.903954802259887, + "grad_norm": 1.406943678855896, + "learning_rate": 3.103448275862069e-06, + "loss": 0.3627, + "step": 2906 + }, + { + "epoch": 6.906333630686887, + "grad_norm": 1.5861262083053589, + "learning_rate": 3.1010701545778838e-06, + "loss": 0.4748, + "step": 2907 + }, + { + "epoch": 6.908712459113886, + "grad_norm": 1.341352939605713, + "learning_rate": 3.098692033293698e-06, + "loss": 0.3262, + "step": 2908 + }, + { + "epoch": 6.911091287540886, + "grad_norm": 1.264319658279419, + "learning_rate": 3.0963139120095126e-06, + "loss": 0.2933, + "step": 2909 + }, + { + "epoch": 6.913470115967886, + "grad_norm": 1.465239405632019, + "learning_rate": 3.093935790725327e-06, + "loss": 0.312, + "step": 2910 + }, + { + "epoch": 6.915848944394885, + "grad_norm": 1.8252347707748413, + "learning_rate": 3.091557669441142e-06, + "loss": 0.4051, + "step": 2911 + }, + { + "epoch": 6.9182277728218855, + "grad_norm": 1.3312535285949707, + "learning_rate": 3.0891795481569564e-06, + "loss": 0.3312, + "step": 2912 + }, + { + "epoch": 6.920606601248885, + "grad_norm": 1.6212743520736694, + "learning_rate": 3.086801426872771e-06, + "loss": 0.3533, + "step": 2913 + }, + { + "epoch": 6.922985429675885, + "grad_norm": 1.3863298892974854, + "learning_rate": 3.084423305588585e-06, + "loss": 0.3674, + "step": 2914 + }, + { + "epoch": 6.925364258102884, + "grad_norm": 1.303475022315979, + "learning_rate": 3.0820451843044e-06, + "loss": 0.3251, + "step": 2915 + }, + { + "epoch": 6.927743086529884, + "grad_norm": 1.6169662475585938, + "learning_rate": 3.079667063020214e-06, + "loss": 0.3847, + "step": 2916 + }, + { + "epoch": 6.930121914956883, + "grad_norm": 1.2923803329467773, + "learning_rate": 3.0772889417360286e-06, + "loss": 0.2999, + "step": 2917 + }, + { + "epoch": 6.9325007433838834, + "grad_norm": 1.3863617181777954, + "learning_rate": 3.0749108204518436e-06, + "loss": 0.3279, + "step": 2918 + }, + { + "epoch": 6.9348795718108835, + "grad_norm": 1.531717300415039, + "learning_rate": 3.072532699167658e-06, + "loss": 0.3135, + "step": 2919 + }, + { + "epoch": 6.937258400237883, + "grad_norm": 1.5075002908706665, + "learning_rate": 3.0701545778834724e-06, + "loss": 0.3342, + "step": 2920 + }, + { + "epoch": 6.939637228664883, + "grad_norm": 1.2486562728881836, + "learning_rate": 3.0677764565992866e-06, + "loss": 0.3046, + "step": 2921 + }, + { + "epoch": 6.942016057091882, + "grad_norm": 1.5580554008483887, + "learning_rate": 3.0653983353151012e-06, + "loss": 0.323, + "step": 2922 + }, + { + "epoch": 6.944394885518882, + "grad_norm": 1.4383682012557983, + "learning_rate": 3.063020214030916e-06, + "loss": 0.3585, + "step": 2923 + }, + { + "epoch": 6.946773713945881, + "grad_norm": 1.4262577295303345, + "learning_rate": 3.06064209274673e-06, + "loss": 0.3768, + "step": 2924 + }, + { + "epoch": 6.9491525423728815, + "grad_norm": 1.4642927646636963, + "learning_rate": 3.058263971462545e-06, + "loss": 0.3132, + "step": 2925 + }, + { + "epoch": 6.951531370799881, + "grad_norm": 1.312036395072937, + "learning_rate": 3.0558858501783597e-06, + "loss": 0.2885, + "step": 2926 + }, + { + "epoch": 6.953910199226881, + "grad_norm": 1.3748372793197632, + "learning_rate": 3.053507728894174e-06, + "loss": 0.3362, + "step": 2927 + }, + { + "epoch": 6.956289027653881, + "grad_norm": 1.4912575483322144, + "learning_rate": 3.0511296076099885e-06, + "loss": 0.3145, + "step": 2928 + }, + { + "epoch": 6.95866785608088, + "grad_norm": 1.5249592065811157, + "learning_rate": 3.0487514863258027e-06, + "loss": 0.3227, + "step": 2929 + }, + { + "epoch": 6.96104668450788, + "grad_norm": 1.489377498626709, + "learning_rate": 3.0463733650416173e-06, + "loss": 0.3517, + "step": 2930 + }, + { + "epoch": 6.963425512934879, + "grad_norm": 1.4792976379394531, + "learning_rate": 3.0439952437574315e-06, + "loss": 0.3831, + "step": 2931 + }, + { + "epoch": 6.9658043413618795, + "grad_norm": 1.3129215240478516, + "learning_rate": 3.0416171224732465e-06, + "loss": 0.3572, + "step": 2932 + }, + { + "epoch": 6.968183169788879, + "grad_norm": 1.1984447240829468, + "learning_rate": 3.039239001189061e-06, + "loss": 0.3136, + "step": 2933 + }, + { + "epoch": 6.970561998215879, + "grad_norm": 1.5325055122375488, + "learning_rate": 3.0368608799048753e-06, + "loss": 0.3294, + "step": 2934 + }, + { + "epoch": 6.972940826642878, + "grad_norm": 1.4182255268096924, + "learning_rate": 3.03448275862069e-06, + "loss": 0.365, + "step": 2935 + }, + { + "epoch": 6.975319655069878, + "grad_norm": 1.2363041639328003, + "learning_rate": 3.032104637336504e-06, + "loss": 0.2995, + "step": 2936 + }, + { + "epoch": 6.977698483496878, + "grad_norm": 1.5297149419784546, + "learning_rate": 3.0297265160523187e-06, + "loss": 0.3846, + "step": 2937 + }, + { + "epoch": 6.9800773119238775, + "grad_norm": 1.4904123544692993, + "learning_rate": 3.0273483947681337e-06, + "loss": 0.3735, + "step": 2938 + }, + { + "epoch": 6.982456140350877, + "grad_norm": 1.4509968757629395, + "learning_rate": 3.024970273483948e-06, + "loss": 0.321, + "step": 2939 + }, + { + "epoch": 6.984834968777877, + "grad_norm": 1.4746577739715576, + "learning_rate": 3.0225921521997625e-06, + "loss": 0.3217, + "step": 2940 + }, + { + "epoch": 6.987213797204877, + "grad_norm": 1.4585851430892944, + "learning_rate": 3.020214030915577e-06, + "loss": 0.3336, + "step": 2941 + }, + { + "epoch": 6.989592625631876, + "grad_norm": 1.346763253211975, + "learning_rate": 3.0178359096313913e-06, + "loss": 0.3557, + "step": 2942 + }, + { + "epoch": 6.991971454058876, + "grad_norm": 1.4345506429672241, + "learning_rate": 3.015457788347206e-06, + "loss": 0.3277, + "step": 2943 + }, + { + "epoch": 6.994350282485875, + "grad_norm": 1.4565621614456177, + "learning_rate": 3.01307966706302e-06, + "loss": 0.3539, + "step": 2944 + }, + { + "epoch": 6.9967291109128755, + "grad_norm": 1.3110193014144897, + "learning_rate": 3.010701545778835e-06, + "loss": 0.339, + "step": 2945 + }, + { + "epoch": 6.999107939339875, + "grad_norm": 1.3658453226089478, + "learning_rate": 3.0083234244946498e-06, + "loss": 0.3221, + "step": 2946 + }, + { + "epoch": 7.0, + "grad_norm": 2.4225709438323975, + "learning_rate": 3.005945303210464e-06, + "loss": 0.2737, + "step": 2947 + }, + { + "epoch": 7.002378828427, + "grad_norm": 1.2866814136505127, + "learning_rate": 3.0035671819262786e-06, + "loss": 0.2933, + "step": 2948 + }, + { + "epoch": 7.004757656853999, + "grad_norm": 1.5019017457962036, + "learning_rate": 3.0011890606420928e-06, + "loss": 0.3665, + "step": 2949 + }, + { + "epoch": 7.007136485280999, + "grad_norm": 1.3421406745910645, + "learning_rate": 2.9988109393579074e-06, + "loss": 0.2877, + "step": 2950 + }, + { + "epoch": 7.007136485280999, + "eval_loss": 0.4234372675418854, + "eval_runtime": 24.0774, + "eval_samples_per_second": 31.066, + "eval_steps_per_second": 15.533, + "step": 2950 + }, + { + "epoch": 7.009515313707999, + "grad_norm": 1.3550580739974976, + "learning_rate": 2.9964328180737216e-06, + "loss": 0.2827, + "step": 2951 + }, + { + "epoch": 7.011894142134999, + "grad_norm": 1.664106011390686, + "learning_rate": 2.9940546967895366e-06, + "loss": 0.3905, + "step": 2952 + }, + { + "epoch": 7.014272970561998, + "grad_norm": 1.6251161098480225, + "learning_rate": 2.9916765755053512e-06, + "loss": 0.4013, + "step": 2953 + }, + { + "epoch": 7.016651798988998, + "grad_norm": 1.222933292388916, + "learning_rate": 2.9892984542211654e-06, + "loss": 0.2404, + "step": 2954 + }, + { + "epoch": 7.019030627415997, + "grad_norm": 1.4483827352523804, + "learning_rate": 2.98692033293698e-06, + "loss": 0.3867, + "step": 2955 + }, + { + "epoch": 7.021409455842997, + "grad_norm": 1.5734061002731323, + "learning_rate": 2.9845422116527946e-06, + "loss": 0.3984, + "step": 2956 + }, + { + "epoch": 7.0237882842699975, + "grad_norm": 1.3482481241226196, + "learning_rate": 2.982164090368609e-06, + "loss": 0.326, + "step": 2957 + }, + { + "epoch": 7.026167112696997, + "grad_norm": 1.45183527469635, + "learning_rate": 2.9797859690844234e-06, + "loss": 0.3114, + "step": 2958 + }, + { + "epoch": 7.028545941123997, + "grad_norm": 1.3493417501449585, + "learning_rate": 2.9774078478002385e-06, + "loss": 0.302, + "step": 2959 + }, + { + "epoch": 7.030924769550996, + "grad_norm": 1.277963638305664, + "learning_rate": 2.9750297265160526e-06, + "loss": 0.2841, + "step": 2960 + }, + { + "epoch": 7.033303597977996, + "grad_norm": 1.4931520223617554, + "learning_rate": 2.9726516052318673e-06, + "loss": 0.377, + "step": 2961 + }, + { + "epoch": 7.035682426404995, + "grad_norm": 1.5793704986572266, + "learning_rate": 2.9702734839476814e-06, + "loss": 0.2961, + "step": 2962 + }, + { + "epoch": 7.038061254831995, + "grad_norm": 1.3198587894439697, + "learning_rate": 2.967895362663496e-06, + "loss": 0.2749, + "step": 2963 + }, + { + "epoch": 7.040440083258995, + "grad_norm": 1.59401273727417, + "learning_rate": 2.9655172413793102e-06, + "loss": 0.3809, + "step": 2964 + }, + { + "epoch": 7.042818911685995, + "grad_norm": 1.2617006301879883, + "learning_rate": 2.963139120095125e-06, + "loss": 0.2876, + "step": 2965 + }, + { + "epoch": 7.045197740112994, + "grad_norm": 1.6442387104034424, + "learning_rate": 2.96076099881094e-06, + "loss": 0.4587, + "step": 2966 + }, + { + "epoch": 7.047576568539994, + "grad_norm": 1.2150065898895264, + "learning_rate": 2.958382877526754e-06, + "loss": 0.2856, + "step": 2967 + }, + { + "epoch": 7.049955396966994, + "grad_norm": 1.3880022764205933, + "learning_rate": 2.9560047562425687e-06, + "loss": 0.3103, + "step": 2968 + }, + { + "epoch": 7.052334225393993, + "grad_norm": 1.367879867553711, + "learning_rate": 2.9536266349583833e-06, + "loss": 0.3211, + "step": 2969 + }, + { + "epoch": 7.0547130538209935, + "grad_norm": 1.6197822093963623, + "learning_rate": 2.9512485136741975e-06, + "loss": 0.3873, + "step": 2970 + }, + { + "epoch": 7.057091882247993, + "grad_norm": 1.4599026441574097, + "learning_rate": 2.948870392390012e-06, + "loss": 0.3863, + "step": 2971 + }, + { + "epoch": 7.059470710674993, + "grad_norm": 1.4044595956802368, + "learning_rate": 2.9464922711058263e-06, + "loss": 0.3579, + "step": 2972 + }, + { + "epoch": 7.061849539101992, + "grad_norm": 1.4430410861968994, + "learning_rate": 2.9441141498216413e-06, + "loss": 0.3452, + "step": 2973 + }, + { + "epoch": 7.064228367528992, + "grad_norm": 1.3332663774490356, + "learning_rate": 2.941736028537456e-06, + "loss": 0.27, + "step": 2974 + }, + { + "epoch": 7.066607195955991, + "grad_norm": 1.3224964141845703, + "learning_rate": 2.93935790725327e-06, + "loss": 0.3643, + "step": 2975 + }, + { + "epoch": 7.068986024382991, + "grad_norm": 1.5350213050842285, + "learning_rate": 2.9369797859690847e-06, + "loss": 0.3339, + "step": 2976 + }, + { + "epoch": 7.0713648528099915, + "grad_norm": 1.2346463203430176, + "learning_rate": 2.934601664684899e-06, + "loss": 0.3369, + "step": 2977 + }, + { + "epoch": 7.073743681236991, + "grad_norm": 1.819899320602417, + "learning_rate": 2.9322235434007135e-06, + "loss": 0.4354, + "step": 2978 + }, + { + "epoch": 7.076122509663991, + "grad_norm": 1.511189341545105, + "learning_rate": 2.9298454221165286e-06, + "loss": 0.2971, + "step": 2979 + }, + { + "epoch": 7.07850133809099, + "grad_norm": 1.3977885246276855, + "learning_rate": 2.9274673008323427e-06, + "loss": 0.3046, + "step": 2980 + }, + { + "epoch": 7.08088016651799, + "grad_norm": 1.3867073059082031, + "learning_rate": 2.9250891795481574e-06, + "loss": 0.3448, + "step": 2981 + }, + { + "epoch": 7.083258994944989, + "grad_norm": 1.3060109615325928, + "learning_rate": 2.9227110582639715e-06, + "loss": 0.3372, + "step": 2982 + }, + { + "epoch": 7.085637823371989, + "grad_norm": 1.2630572319030762, + "learning_rate": 2.920332936979786e-06, + "loss": 0.2459, + "step": 2983 + }, + { + "epoch": 7.088016651798989, + "grad_norm": 1.5547776222229004, + "learning_rate": 2.9179548156956008e-06, + "loss": 0.3648, + "step": 2984 + }, + { + "epoch": 7.090395480225989, + "grad_norm": 1.2999500036239624, + "learning_rate": 2.915576694411415e-06, + "loss": 0.3631, + "step": 2985 + }, + { + "epoch": 7.092774308652988, + "grad_norm": 1.529556393623352, + "learning_rate": 2.91319857312723e-06, + "loss": 0.3358, + "step": 2986 + }, + { + "epoch": 7.095153137079988, + "grad_norm": 1.4695147275924683, + "learning_rate": 2.9108204518430446e-06, + "loss": 0.3587, + "step": 2987 + }, + { + "epoch": 7.097531965506988, + "grad_norm": 1.404646873474121, + "learning_rate": 2.9084423305588588e-06, + "loss": 0.3324, + "step": 2988 + }, + { + "epoch": 7.099910793933987, + "grad_norm": 1.3951613903045654, + "learning_rate": 2.9060642092746734e-06, + "loss": 0.3319, + "step": 2989 + }, + { + "epoch": 7.1022896223609875, + "grad_norm": 1.2933598756790161, + "learning_rate": 2.9036860879904876e-06, + "loss": 0.3304, + "step": 2990 + }, + { + "epoch": 7.104668450787987, + "grad_norm": 1.561038851737976, + "learning_rate": 2.901307966706302e-06, + "loss": 0.3504, + "step": 2991 + }, + { + "epoch": 7.107047279214987, + "grad_norm": 1.475908875465393, + "learning_rate": 2.8989298454221164e-06, + "loss": 0.3152, + "step": 2992 + }, + { + "epoch": 7.109426107641986, + "grad_norm": 1.5679782629013062, + "learning_rate": 2.8965517241379314e-06, + "loss": 0.346, + "step": 2993 + }, + { + "epoch": 7.111804936068986, + "grad_norm": 1.2477593421936035, + "learning_rate": 2.894173602853746e-06, + "loss": 0.2715, + "step": 2994 + }, + { + "epoch": 7.114183764495985, + "grad_norm": 1.2135212421417236, + "learning_rate": 2.8917954815695602e-06, + "loss": 0.2546, + "step": 2995 + }, + { + "epoch": 7.116562592922985, + "grad_norm": 1.4185711145401, + "learning_rate": 2.889417360285375e-06, + "loss": 0.2858, + "step": 2996 + }, + { + "epoch": 7.1189414213499855, + "grad_norm": 1.4968845844268799, + "learning_rate": 2.887039239001189e-06, + "loss": 0.3122, + "step": 2997 + }, + { + "epoch": 7.121320249776985, + "grad_norm": 1.4667471647262573, + "learning_rate": 2.8846611177170036e-06, + "loss": 0.3747, + "step": 2998 + }, + { + "epoch": 7.123699078203985, + "grad_norm": 1.4688721895217896, + "learning_rate": 2.8822829964328182e-06, + "loss": 0.3578, + "step": 2999 + }, + { + "epoch": 7.126077906630984, + "grad_norm": 1.6144357919692993, + "learning_rate": 2.879904875148633e-06, + "loss": 0.2834, + "step": 3000 + }, + { + "epoch": 7.126077906630984, + "eval_loss": 0.4251287877559662, + "eval_runtime": 23.878, + "eval_samples_per_second": 31.326, + "eval_steps_per_second": 15.663, + "step": 3000 + }, + { + "epoch": 7.128456735057984, + "grad_norm": 1.4574451446533203, + "learning_rate": 2.8775267538644475e-06, + "loss": 0.2856, + "step": 3001 + }, + { + "epoch": 7.130835563484983, + "grad_norm": 1.4845101833343506, + "learning_rate": 2.875148632580262e-06, + "loss": 0.3192, + "step": 3002 + }, + { + "epoch": 7.1332143919119835, + "grad_norm": 1.578880786895752, + "learning_rate": 2.8727705112960763e-06, + "loss": 0.3556, + "step": 3003 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 1.5985043048858643, + "learning_rate": 2.870392390011891e-06, + "loss": 0.413, + "step": 3004 + }, + { + "epoch": 7.137972048765983, + "grad_norm": 1.555967926979065, + "learning_rate": 2.868014268727705e-06, + "loss": 0.2879, + "step": 3005 + }, + { + "epoch": 7.140350877192983, + "grad_norm": 1.3637062311172485, + "learning_rate": 2.8656361474435197e-06, + "loss": 0.3109, + "step": 3006 + }, + { + "epoch": 7.142729705619982, + "grad_norm": 1.32518470287323, + "learning_rate": 2.8632580261593347e-06, + "loss": 0.319, + "step": 3007 + }, + { + "epoch": 7.145108534046982, + "grad_norm": 1.510268211364746, + "learning_rate": 2.860879904875149e-06, + "loss": 0.3501, + "step": 3008 + }, + { + "epoch": 7.147487362473981, + "grad_norm": 1.5507092475891113, + "learning_rate": 2.8585017835909635e-06, + "loss": 0.3624, + "step": 3009 + }, + { + "epoch": 7.1498661909009815, + "grad_norm": 1.4813852310180664, + "learning_rate": 2.8561236623067777e-06, + "loss": 0.3512, + "step": 3010 + }, + { + "epoch": 7.152245019327981, + "grad_norm": 1.50169837474823, + "learning_rate": 2.8537455410225923e-06, + "loss": 0.2922, + "step": 3011 + }, + { + "epoch": 7.154623847754981, + "grad_norm": 1.6155248880386353, + "learning_rate": 2.851367419738407e-06, + "loss": 0.3571, + "step": 3012 + }, + { + "epoch": 7.15700267618198, + "grad_norm": 1.3408325910568237, + "learning_rate": 2.8489892984542215e-06, + "loss": 0.3294, + "step": 3013 + }, + { + "epoch": 7.15938150460898, + "grad_norm": 1.6139357089996338, + "learning_rate": 2.846611177170036e-06, + "loss": 0.2475, + "step": 3014 + }, + { + "epoch": 7.161760333035979, + "grad_norm": 1.3346033096313477, + "learning_rate": 2.8442330558858503e-06, + "loss": 0.3054, + "step": 3015 + }, + { + "epoch": 7.1641391614629795, + "grad_norm": 1.6061490774154663, + "learning_rate": 2.841854934601665e-06, + "loss": 0.3869, + "step": 3016 + }, + { + "epoch": 7.16651798988998, + "grad_norm": 1.4216632843017578, + "learning_rate": 2.8394768133174795e-06, + "loss": 0.3166, + "step": 3017 + }, + { + "epoch": 7.168896818316979, + "grad_norm": 1.564766526222229, + "learning_rate": 2.8370986920332937e-06, + "loss": 0.2857, + "step": 3018 + }, + { + "epoch": 7.171275646743979, + "grad_norm": 1.4604299068450928, + "learning_rate": 2.8347205707491083e-06, + "loss": 0.3676, + "step": 3019 + }, + { + "epoch": 7.173654475170978, + "grad_norm": 1.2607996463775635, + "learning_rate": 2.8323424494649234e-06, + "loss": 0.2956, + "step": 3020 + }, + { + "epoch": 7.176033303597978, + "grad_norm": 1.4073858261108398, + "learning_rate": 2.8299643281807376e-06, + "loss": 0.3141, + "step": 3021 + }, + { + "epoch": 7.178412132024977, + "grad_norm": 1.2486492395401, + "learning_rate": 2.827586206896552e-06, + "loss": 0.3128, + "step": 3022 + }, + { + "epoch": 7.1807909604519775, + "grad_norm": 1.4238802194595337, + "learning_rate": 2.8252080856123664e-06, + "loss": 0.3074, + "step": 3023 + }, + { + "epoch": 7.183169788878977, + "grad_norm": 1.3892570734024048, + "learning_rate": 2.822829964328181e-06, + "loss": 0.3057, + "step": 3024 + }, + { + "epoch": 7.185548617305977, + "grad_norm": 1.5374196767807007, + "learning_rate": 2.820451843043995e-06, + "loss": 0.333, + "step": 3025 + }, + { + "epoch": 7.187927445732977, + "grad_norm": 1.390607237815857, + "learning_rate": 2.8180737217598098e-06, + "loss": 0.3108, + "step": 3026 + }, + { + "epoch": 7.190306274159976, + "grad_norm": 1.4916608333587646, + "learning_rate": 2.815695600475625e-06, + "loss": 0.3357, + "step": 3027 + }, + { + "epoch": 7.192685102586976, + "grad_norm": 1.4379554986953735, + "learning_rate": 2.813317479191439e-06, + "loss": 0.3077, + "step": 3028 + }, + { + "epoch": 7.1950639310139755, + "grad_norm": 1.2366918325424194, + "learning_rate": 2.8109393579072536e-06, + "loss": 0.321, + "step": 3029 + }, + { + "epoch": 7.197442759440976, + "grad_norm": 1.3501452207565308, + "learning_rate": 2.808561236623068e-06, + "loss": 0.3238, + "step": 3030 + }, + { + "epoch": 7.199821587867975, + "grad_norm": 1.5397756099700928, + "learning_rate": 2.8061831153388824e-06, + "loss": 0.3735, + "step": 3031 + }, + { + "epoch": 7.202200416294975, + "grad_norm": 1.3461271524429321, + "learning_rate": 2.803804994054697e-06, + "loss": 0.3124, + "step": 3032 + }, + { + "epoch": 7.204579244721974, + "grad_norm": 1.485197901725769, + "learning_rate": 2.801426872770511e-06, + "loss": 0.3975, + "step": 3033 + }, + { + "epoch": 7.206958073148974, + "grad_norm": 1.6976451873779297, + "learning_rate": 2.7990487514863262e-06, + "loss": 0.343, + "step": 3034 + }, + { + "epoch": 7.209336901575973, + "grad_norm": 1.5384577512741089, + "learning_rate": 2.796670630202141e-06, + "loss": 0.351, + "step": 3035 + }, + { + "epoch": 7.2117157300029735, + "grad_norm": 1.46756911277771, + "learning_rate": 2.794292508917955e-06, + "loss": 0.3484, + "step": 3036 + }, + { + "epoch": 7.214094558429974, + "grad_norm": 1.3909190893173218, + "learning_rate": 2.7919143876337696e-06, + "loss": 0.3633, + "step": 3037 + }, + { + "epoch": 7.216473386856973, + "grad_norm": 1.5076216459274292, + "learning_rate": 2.789536266349584e-06, + "loss": 0.3175, + "step": 3038 + }, + { + "epoch": 7.218852215283973, + "grad_norm": 1.3265541791915894, + "learning_rate": 2.7871581450653984e-06, + "loss": 0.3581, + "step": 3039 + }, + { + "epoch": 7.221231043710972, + "grad_norm": 1.382404088973999, + "learning_rate": 2.7847800237812126e-06, + "loss": 0.3152, + "step": 3040 + }, + { + "epoch": 7.223609872137972, + "grad_norm": 1.4456698894500732, + "learning_rate": 2.7824019024970277e-06, + "loss": 0.333, + "step": 3041 + }, + { + "epoch": 7.2259887005649714, + "grad_norm": 1.527531385421753, + "learning_rate": 2.7800237812128423e-06, + "loss": 0.3664, + "step": 3042 + }, + { + "epoch": 7.2283675289919715, + "grad_norm": 1.5032025575637817, + "learning_rate": 2.7776456599286565e-06, + "loss": 0.2875, + "step": 3043 + }, + { + "epoch": 7.230746357418971, + "grad_norm": 1.3119163513183594, + "learning_rate": 2.775267538644471e-06, + "loss": 0.2888, + "step": 3044 + }, + { + "epoch": 7.233125185845971, + "grad_norm": 1.4871104955673218, + "learning_rate": 2.7728894173602857e-06, + "loss": 0.2714, + "step": 3045 + }, + { + "epoch": 7.235504014272971, + "grad_norm": 1.4243842363357544, + "learning_rate": 2.7705112960761e-06, + "loss": 0.3921, + "step": 3046 + }, + { + "epoch": 7.23788284269997, + "grad_norm": 1.574776291847229, + "learning_rate": 2.7681331747919145e-06, + "loss": 0.312, + "step": 3047 + }, + { + "epoch": 7.24026167112697, + "grad_norm": 1.443030595779419, + "learning_rate": 2.7657550535077295e-06, + "loss": 0.3224, + "step": 3048 + }, + { + "epoch": 7.2426404995539695, + "grad_norm": 1.4133692979812622, + "learning_rate": 2.7633769322235437e-06, + "loss": 0.3385, + "step": 3049 + }, + { + "epoch": 7.24501932798097, + "grad_norm": 1.4445950984954834, + "learning_rate": 2.7609988109393583e-06, + "loss": 0.3466, + "step": 3050 + }, + { + "epoch": 7.24501932798097, + "eval_loss": 0.4258868396282196, + "eval_runtime": 23.3759, + "eval_samples_per_second": 31.999, + "eval_steps_per_second": 15.999, + "step": 3050 + }, + { + "epoch": 7.247398156407969, + "grad_norm": 1.4001848697662354, + "learning_rate": 2.7586206896551725e-06, + "loss": 0.3152, + "step": 3051 + }, + { + "epoch": 7.249776984834969, + "grad_norm": 1.52290940284729, + "learning_rate": 2.756242568370987e-06, + "loss": 0.3584, + "step": 3052 + }, + { + "epoch": 7.252155813261968, + "grad_norm": 1.4810575246810913, + "learning_rate": 2.7538644470868013e-06, + "loss": 0.3742, + "step": 3053 + }, + { + "epoch": 7.254534641688968, + "grad_norm": 1.4414474964141846, + "learning_rate": 2.7514863258026163e-06, + "loss": 0.3669, + "step": 3054 + }, + { + "epoch": 7.256913470115968, + "grad_norm": 1.486236810684204, + "learning_rate": 2.749108204518431e-06, + "loss": 0.3071, + "step": 3055 + }, + { + "epoch": 7.2592922985429675, + "grad_norm": 1.3205069303512573, + "learning_rate": 2.746730083234245e-06, + "loss": 0.2627, + "step": 3056 + }, + { + "epoch": 7.261671126969968, + "grad_norm": 1.444412112236023, + "learning_rate": 2.7443519619500597e-06, + "loss": 0.3323, + "step": 3057 + }, + { + "epoch": 7.264049955396967, + "grad_norm": 1.6824218034744263, + "learning_rate": 2.741973840665874e-06, + "loss": 0.3581, + "step": 3058 + }, + { + "epoch": 7.266428783823967, + "grad_norm": 1.4174275398254395, + "learning_rate": 2.7395957193816885e-06, + "loss": 0.2757, + "step": 3059 + }, + { + "epoch": 7.268807612250966, + "grad_norm": 1.4983530044555664, + "learning_rate": 2.737217598097503e-06, + "loss": 0.3431, + "step": 3060 + }, + { + "epoch": 7.271186440677966, + "grad_norm": 1.4670859575271606, + "learning_rate": 2.7348394768133178e-06, + "loss": 0.3573, + "step": 3061 + }, + { + "epoch": 7.2735652691049655, + "grad_norm": 1.3930273056030273, + "learning_rate": 2.7324613555291324e-06, + "loss": 0.3488, + "step": 3062 + }, + { + "epoch": 7.275944097531966, + "grad_norm": 1.6817212104797363, + "learning_rate": 2.730083234244947e-06, + "loss": 0.4097, + "step": 3063 + }, + { + "epoch": 7.278322925958965, + "grad_norm": 1.3912428617477417, + "learning_rate": 2.727705112960761e-06, + "loss": 0.297, + "step": 3064 + }, + { + "epoch": 7.280701754385965, + "grad_norm": 1.3886240720748901, + "learning_rate": 2.7253269916765758e-06, + "loss": 0.3187, + "step": 3065 + }, + { + "epoch": 7.283080582812965, + "grad_norm": 1.3610830307006836, + "learning_rate": 2.72294887039239e-06, + "loss": 0.317, + "step": 3066 + }, + { + "epoch": 7.285459411239964, + "grad_norm": 1.3623230457305908, + "learning_rate": 2.7205707491082046e-06, + "loss": 0.331, + "step": 3067 + }, + { + "epoch": 7.287838239666964, + "grad_norm": 1.5625314712524414, + "learning_rate": 2.7181926278240196e-06, + "loss": 0.3755, + "step": 3068 + }, + { + "epoch": 7.2902170680939635, + "grad_norm": 1.4710798263549805, + "learning_rate": 2.715814506539834e-06, + "loss": 0.3602, + "step": 3069 + }, + { + "epoch": 7.292595896520964, + "grad_norm": 1.2548584938049316, + "learning_rate": 2.7134363852556484e-06, + "loss": 0.3693, + "step": 3070 + }, + { + "epoch": 7.294974724947963, + "grad_norm": 1.384434461593628, + "learning_rate": 2.7110582639714626e-06, + "loss": 0.3173, + "step": 3071 + }, + { + "epoch": 7.297353553374963, + "grad_norm": 1.445136308670044, + "learning_rate": 2.708680142687277e-06, + "loss": 0.3534, + "step": 3072 + }, + { + "epoch": 7.299732381801962, + "grad_norm": 1.3579226732254028, + "learning_rate": 2.706302021403092e-06, + "loss": 0.3013, + "step": 3073 + }, + { + "epoch": 7.302111210228962, + "grad_norm": 1.283040165901184, + "learning_rate": 2.703923900118906e-06, + "loss": 0.3014, + "step": 3074 + }, + { + "epoch": 7.3044900386559615, + "grad_norm": 1.6444274187088013, + "learning_rate": 2.701545778834721e-06, + "loss": 0.3277, + "step": 3075 + }, + { + "epoch": 7.306868867082962, + "grad_norm": 1.4135137796401978, + "learning_rate": 2.6991676575505356e-06, + "loss": 0.3697, + "step": 3076 + }, + { + "epoch": 7.309247695509962, + "grad_norm": 1.391108751296997, + "learning_rate": 2.69678953626635e-06, + "loss": 0.3285, + "step": 3077 + }, + { + "epoch": 7.311626523936961, + "grad_norm": 1.3016142845153809, + "learning_rate": 2.6944114149821644e-06, + "loss": 0.3585, + "step": 3078 + }, + { + "epoch": 7.314005352363961, + "grad_norm": 1.2839300632476807, + "learning_rate": 2.6920332936979786e-06, + "loss": 0.3307, + "step": 3079 + }, + { + "epoch": 7.31638418079096, + "grad_norm": 1.1886889934539795, + "learning_rate": 2.6896551724137932e-06, + "loss": 0.2772, + "step": 3080 + }, + { + "epoch": 7.31876300921796, + "grad_norm": 1.5614640712738037, + "learning_rate": 2.6872770511296074e-06, + "loss": 0.3595, + "step": 3081 + }, + { + "epoch": 7.3211418376449595, + "grad_norm": 1.2366526126861572, + "learning_rate": 2.6848989298454225e-06, + "loss": 0.2865, + "step": 3082 + }, + { + "epoch": 7.32352066607196, + "grad_norm": 1.770639419555664, + "learning_rate": 2.682520808561237e-06, + "loss": 0.3344, + "step": 3083 + }, + { + "epoch": 7.325899494498959, + "grad_norm": 1.4296238422393799, + "learning_rate": 2.6801426872770513e-06, + "loss": 0.3101, + "step": 3084 + }, + { + "epoch": 7.328278322925959, + "grad_norm": 1.422737956047058, + "learning_rate": 2.677764565992866e-06, + "loss": 0.2765, + "step": 3085 + }, + { + "epoch": 7.330657151352959, + "grad_norm": 1.3920886516571045, + "learning_rate": 2.67538644470868e-06, + "loss": 0.3416, + "step": 3086 + }, + { + "epoch": 7.333035979779958, + "grad_norm": 1.7140244245529175, + "learning_rate": 2.6730083234244947e-06, + "loss": 0.3364, + "step": 3087 + }, + { + "epoch": 7.335414808206958, + "grad_norm": 1.4303035736083984, + "learning_rate": 2.6706302021403093e-06, + "loss": 0.308, + "step": 3088 + }, + { + "epoch": 7.337793636633958, + "grad_norm": 1.5629559755325317, + "learning_rate": 2.668252080856124e-06, + "loss": 0.3108, + "step": 3089 + }, + { + "epoch": 7.340172465060958, + "grad_norm": 1.3572946786880493, + "learning_rate": 2.6658739595719385e-06, + "loss": 0.3004, + "step": 3090 + }, + { + "epoch": 7.342551293487957, + "grad_norm": 1.4875978231430054, + "learning_rate": 2.663495838287753e-06, + "loss": 0.367, + "step": 3091 + }, + { + "epoch": 7.344930121914957, + "grad_norm": 1.4923677444458008, + "learning_rate": 2.6611177170035673e-06, + "loss": 0.3792, + "step": 3092 + }, + { + "epoch": 7.347308950341956, + "grad_norm": 1.6642735004425049, + "learning_rate": 2.658739595719382e-06, + "loss": 0.386, + "step": 3093 + }, + { + "epoch": 7.349687778768956, + "grad_norm": 1.4917572736740112, + "learning_rate": 2.656361474435196e-06, + "loss": 0.359, + "step": 3094 + }, + { + "epoch": 7.352066607195956, + "grad_norm": 1.5021194219589233, + "learning_rate": 2.653983353151011e-06, + "loss": 0.3124, + "step": 3095 + }, + { + "epoch": 7.354445435622956, + "grad_norm": 1.6576999425888062, + "learning_rate": 2.6516052318668257e-06, + "loss": 0.4627, + "step": 3096 + }, + { + "epoch": 7.356824264049956, + "grad_norm": 1.626568078994751, + "learning_rate": 2.64922711058264e-06, + "loss": 0.3602, + "step": 3097 + }, + { + "epoch": 7.359203092476955, + "grad_norm": 1.5944223403930664, + "learning_rate": 2.6468489892984545e-06, + "loss": 0.3794, + "step": 3098 + }, + { + "epoch": 7.361581920903955, + "grad_norm": 1.6035648584365845, + "learning_rate": 2.6444708680142687e-06, + "loss": 0.3576, + "step": 3099 + }, + { + "epoch": 7.363960749330954, + "grad_norm": 1.4863388538360596, + "learning_rate": 2.6420927467300833e-06, + "loss": 0.3802, + "step": 3100 + }, + { + "epoch": 7.363960749330954, + "eval_loss": 0.42565590143203735, + "eval_runtime": 23.1978, + "eval_samples_per_second": 32.244, + "eval_steps_per_second": 16.122, + "step": 3100 + }, + { + "epoch": 7.366339577757954, + "grad_norm": 1.264291763305664, + "learning_rate": 2.6397146254458975e-06, + "loss": 0.3156, + "step": 3101 + }, + { + "epoch": 7.3687184061849536, + "grad_norm": 1.4825289249420166, + "learning_rate": 2.6373365041617126e-06, + "loss": 0.3571, + "step": 3102 + }, + { + "epoch": 7.371097234611954, + "grad_norm": 1.4485363960266113, + "learning_rate": 2.634958382877527e-06, + "loss": 0.3435, + "step": 3103 + }, + { + "epoch": 7.373476063038954, + "grad_norm": 1.369432806968689, + "learning_rate": 2.6325802615933414e-06, + "loss": 0.2977, + "step": 3104 + }, + { + "epoch": 7.375854891465953, + "grad_norm": 1.3378500938415527, + "learning_rate": 2.630202140309156e-06, + "loss": 0.3159, + "step": 3105 + }, + { + "epoch": 7.378233719892953, + "grad_norm": 1.4564255475997925, + "learning_rate": 2.6278240190249706e-06, + "loss": 0.379, + "step": 3106 + }, + { + "epoch": 7.380612548319952, + "grad_norm": 1.5862705707550049, + "learning_rate": 2.6254458977407848e-06, + "loss": 0.3405, + "step": 3107 + }, + { + "epoch": 7.382991376746952, + "grad_norm": 1.542419195175171, + "learning_rate": 2.6230677764565994e-06, + "loss": 0.3114, + "step": 3108 + }, + { + "epoch": 7.385370205173952, + "grad_norm": 1.5399863719940186, + "learning_rate": 2.6206896551724144e-06, + "loss": 0.3545, + "step": 3109 + }, + { + "epoch": 7.387749033600952, + "grad_norm": 1.3577919006347656, + "learning_rate": 2.6183115338882286e-06, + "loss": 0.2718, + "step": 3110 + }, + { + "epoch": 7.390127862027951, + "grad_norm": 1.499398112297058, + "learning_rate": 2.6159334126040432e-06, + "loss": 0.3681, + "step": 3111 + }, + { + "epoch": 7.392506690454951, + "grad_norm": 1.232123613357544, + "learning_rate": 2.6135552913198574e-06, + "loss": 0.2727, + "step": 3112 + }, + { + "epoch": 7.39488551888195, + "grad_norm": 1.524108648300171, + "learning_rate": 2.611177170035672e-06, + "loss": 0.2569, + "step": 3113 + }, + { + "epoch": 7.39726434730895, + "grad_norm": 1.3201324939727783, + "learning_rate": 2.608799048751486e-06, + "loss": 0.2598, + "step": 3114 + }, + { + "epoch": 7.39964317573595, + "grad_norm": 1.4778393507003784, + "learning_rate": 2.606420927467301e-06, + "loss": 0.3292, + "step": 3115 + }, + { + "epoch": 7.40202200416295, + "grad_norm": 1.483357310295105, + "learning_rate": 2.604042806183116e-06, + "loss": 0.3369, + "step": 3116 + }, + { + "epoch": 7.40440083258995, + "grad_norm": 1.3852593898773193, + "learning_rate": 2.60166468489893e-06, + "loss": 0.3335, + "step": 3117 + }, + { + "epoch": 7.406779661016949, + "grad_norm": 1.6114912033081055, + "learning_rate": 2.5992865636147446e-06, + "loss": 0.3756, + "step": 3118 + }, + { + "epoch": 7.409158489443949, + "grad_norm": 1.5122921466827393, + "learning_rate": 2.5969084423305593e-06, + "loss": 0.3203, + "step": 3119 + }, + { + "epoch": 7.411537317870948, + "grad_norm": 1.6318726539611816, + "learning_rate": 2.5945303210463734e-06, + "loss": 0.3486, + "step": 3120 + }, + { + "epoch": 7.413916146297948, + "grad_norm": 1.696450114250183, + "learning_rate": 2.592152199762188e-06, + "loss": 0.3087, + "step": 3121 + }, + { + "epoch": 7.416294974724948, + "grad_norm": 1.4736454486846924, + "learning_rate": 2.5897740784780022e-06, + "loss": 0.3349, + "step": 3122 + }, + { + "epoch": 7.418673803151948, + "grad_norm": 1.5422126054763794, + "learning_rate": 2.5873959571938173e-06, + "loss": 0.3393, + "step": 3123 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 1.5082223415374756, + "learning_rate": 2.585017835909632e-06, + "loss": 0.3085, + "step": 3124 + }, + { + "epoch": 7.423431460005947, + "grad_norm": 1.5170543193817139, + "learning_rate": 2.582639714625446e-06, + "loss": 0.3423, + "step": 3125 + }, + { + "epoch": 7.425810288432947, + "grad_norm": 1.5368679761886597, + "learning_rate": 2.5802615933412607e-06, + "loss": 0.3615, + "step": 3126 + }, + { + "epoch": 7.428189116859946, + "grad_norm": 1.477596402168274, + "learning_rate": 2.577883472057075e-06, + "loss": 0.3732, + "step": 3127 + }, + { + "epoch": 7.430567945286946, + "grad_norm": 1.4233729839324951, + "learning_rate": 2.5755053507728895e-06, + "loss": 0.3046, + "step": 3128 + }, + { + "epoch": 7.432946773713946, + "grad_norm": 1.5166014432907104, + "learning_rate": 2.5731272294887037e-06, + "loss": 0.3316, + "step": 3129 + }, + { + "epoch": 7.435325602140946, + "grad_norm": 1.646687626838684, + "learning_rate": 2.5707491082045187e-06, + "loss": 0.313, + "step": 3130 + }, + { + "epoch": 7.437704430567945, + "grad_norm": 1.5180085897445679, + "learning_rate": 2.5683709869203333e-06, + "loss": 0.3853, + "step": 3131 + }, + { + "epoch": 7.440083258994945, + "grad_norm": 1.4314266443252563, + "learning_rate": 2.5659928656361475e-06, + "loss": 0.2673, + "step": 3132 + }, + { + "epoch": 7.442462087421944, + "grad_norm": 1.4579449892044067, + "learning_rate": 2.563614744351962e-06, + "loss": 0.3565, + "step": 3133 + }, + { + "epoch": 7.444840915848944, + "grad_norm": 1.7134509086608887, + "learning_rate": 2.5612366230677767e-06, + "loss": 0.4243, + "step": 3134 + }, + { + "epoch": 7.4472197442759445, + "grad_norm": 1.3110231161117554, + "learning_rate": 2.558858501783591e-06, + "loss": 0.3033, + "step": 3135 + }, + { + "epoch": 7.449598572702944, + "grad_norm": 1.420120358467102, + "learning_rate": 2.556480380499406e-06, + "loss": 0.3585, + "step": 3136 + }, + { + "epoch": 7.451977401129944, + "grad_norm": 1.4151333570480347, + "learning_rate": 2.5541022592152206e-06, + "loss": 0.3618, + "step": 3137 + }, + { + "epoch": 7.454356229556943, + "grad_norm": 1.4592183828353882, + "learning_rate": 2.5517241379310347e-06, + "loss": 0.2961, + "step": 3138 + }, + { + "epoch": 7.456735057983943, + "grad_norm": 1.4459141492843628, + "learning_rate": 2.5493460166468494e-06, + "loss": 0.3715, + "step": 3139 + }, + { + "epoch": 7.459113886410942, + "grad_norm": 1.6419025659561157, + "learning_rate": 2.5469678953626635e-06, + "loss": 0.304, + "step": 3140 + }, + { + "epoch": 7.461492714837942, + "grad_norm": 1.5064197778701782, + "learning_rate": 2.544589774078478e-06, + "loss": 0.3164, + "step": 3141 + }, + { + "epoch": 7.463871543264942, + "grad_norm": 1.523261308670044, + "learning_rate": 2.5422116527942923e-06, + "loss": 0.3506, + "step": 3142 + }, + { + "epoch": 7.466250371691942, + "grad_norm": 1.50393545627594, + "learning_rate": 2.5398335315101074e-06, + "loss": 0.317, + "step": 3143 + }, + { + "epoch": 7.468629200118942, + "grad_norm": 1.5082001686096191, + "learning_rate": 2.537455410225922e-06, + "loss": 0.374, + "step": 3144 + }, + { + "epoch": 7.471008028545941, + "grad_norm": 1.4517046213150024, + "learning_rate": 2.535077288941736e-06, + "loss": 0.3985, + "step": 3145 + }, + { + "epoch": 7.473386856972941, + "grad_norm": 1.3499882221221924, + "learning_rate": 2.5326991676575508e-06, + "loss": 0.3437, + "step": 3146 + }, + { + "epoch": 7.47576568539994, + "grad_norm": 1.4982030391693115, + "learning_rate": 2.530321046373365e-06, + "loss": 0.28, + "step": 3147 + }, + { + "epoch": 7.4781445138269405, + "grad_norm": 1.308917760848999, + "learning_rate": 2.5279429250891796e-06, + "loss": 0.325, + "step": 3148 + }, + { + "epoch": 7.48052334225394, + "grad_norm": 1.4385558366775513, + "learning_rate": 2.525564803804994e-06, + "loss": 0.3254, + "step": 3149 + }, + { + "epoch": 7.48290217068094, + "grad_norm": 1.3092890977859497, + "learning_rate": 2.523186682520809e-06, + "loss": 0.3337, + "step": 3150 + }, + { + "epoch": 7.48290217068094, + "eval_loss": 0.4242006242275238, + "eval_runtime": 23.0744, + "eval_samples_per_second": 32.417, + "eval_steps_per_second": 16.208, + "step": 3150 + }, + { + "epoch": 7.485280999107939, + "grad_norm": 1.2962702512741089, + "learning_rate": 2.5208085612366234e-06, + "loss": 0.2915, + "step": 3151 + }, + { + "epoch": 7.487659827534939, + "grad_norm": 1.5002399682998657, + "learning_rate": 2.518430439952438e-06, + "loss": 0.2673, + "step": 3152 + }, + { + "epoch": 7.490038655961939, + "grad_norm": 1.6173349618911743, + "learning_rate": 2.5160523186682522e-06, + "loss": 0.353, + "step": 3153 + }, + { + "epoch": 7.492417484388938, + "grad_norm": 1.698455810546875, + "learning_rate": 2.513674197384067e-06, + "loss": 0.3176, + "step": 3154 + }, + { + "epoch": 7.4947963128159385, + "grad_norm": 1.5773831605911255, + "learning_rate": 2.511296076099881e-06, + "loss": 0.4062, + "step": 3155 + }, + { + "epoch": 7.497175141242938, + "grad_norm": 1.3851008415222168, + "learning_rate": 2.5089179548156956e-06, + "loss": 0.2974, + "step": 3156 + }, + { + "epoch": 7.499553969669938, + "grad_norm": 1.3688408136367798, + "learning_rate": 2.5065398335315107e-06, + "loss": 0.3291, + "step": 3157 + }, + { + "epoch": 7.501932798096937, + "grad_norm": 1.4578022956848145, + "learning_rate": 2.504161712247325e-06, + "loss": 0.299, + "step": 3158 + }, + { + "epoch": 7.504311626523937, + "grad_norm": 1.3130944967269897, + "learning_rate": 2.5017835909631395e-06, + "loss": 0.3466, + "step": 3159 + }, + { + "epoch": 7.506690454950936, + "grad_norm": 1.4340604543685913, + "learning_rate": 2.4994054696789536e-06, + "loss": 0.3373, + "step": 3160 + }, + { + "epoch": 7.509069283377936, + "grad_norm": 1.4963853359222412, + "learning_rate": 2.4970273483947683e-06, + "loss": 0.3586, + "step": 3161 + }, + { + "epoch": 7.5114481118049365, + "grad_norm": 1.5590804815292358, + "learning_rate": 2.494649227110583e-06, + "loss": 0.3111, + "step": 3162 + }, + { + "epoch": 7.513826940231936, + "grad_norm": 1.5240285396575928, + "learning_rate": 2.4922711058263975e-06, + "loss": 0.304, + "step": 3163 + }, + { + "epoch": 7.516205768658936, + "grad_norm": 1.4269561767578125, + "learning_rate": 2.4898929845422117e-06, + "loss": 0.2915, + "step": 3164 + }, + { + "epoch": 7.518584597085935, + "grad_norm": 1.5887725353240967, + "learning_rate": 2.4875148632580267e-06, + "loss": 0.3487, + "step": 3165 + }, + { + "epoch": 7.520963425512935, + "grad_norm": 1.3271087408065796, + "learning_rate": 2.485136741973841e-06, + "loss": 0.3219, + "step": 3166 + }, + { + "epoch": 7.523342253939934, + "grad_norm": 1.5539592504501343, + "learning_rate": 2.4827586206896555e-06, + "loss": 0.3245, + "step": 3167 + }, + { + "epoch": 7.5257210823669345, + "grad_norm": 1.5467103719711304, + "learning_rate": 2.4803804994054697e-06, + "loss": 0.3499, + "step": 3168 + }, + { + "epoch": 7.528099910793934, + "grad_norm": 1.5960679054260254, + "learning_rate": 2.4780023781212843e-06, + "loss": 0.3481, + "step": 3169 + }, + { + "epoch": 7.530478739220934, + "grad_norm": 1.4572833776474, + "learning_rate": 2.475624256837099e-06, + "loss": 0.3781, + "step": 3170 + }, + { + "epoch": 7.532857567647933, + "grad_norm": 1.614854335784912, + "learning_rate": 2.4732461355529135e-06, + "loss": 0.4095, + "step": 3171 + }, + { + "epoch": 7.535236396074933, + "grad_norm": 1.2329866886138916, + "learning_rate": 2.470868014268728e-06, + "loss": 0.3011, + "step": 3172 + }, + { + "epoch": 7.537615224501932, + "grad_norm": 1.4653210639953613, + "learning_rate": 2.4684898929845423e-06, + "loss": 0.2891, + "step": 3173 + }, + { + "epoch": 7.539994052928932, + "grad_norm": 1.5144091844558716, + "learning_rate": 2.466111771700357e-06, + "loss": 0.3563, + "step": 3174 + }, + { + "epoch": 7.5423728813559325, + "grad_norm": 1.541222333908081, + "learning_rate": 2.463733650416171e-06, + "loss": 0.3706, + "step": 3175 + }, + { + "epoch": 7.544751709782932, + "grad_norm": 1.293131709098816, + "learning_rate": 2.461355529131986e-06, + "loss": 0.3278, + "step": 3176 + }, + { + "epoch": 7.547130538209932, + "grad_norm": 1.486072301864624, + "learning_rate": 2.4589774078478003e-06, + "loss": 0.3084, + "step": 3177 + }, + { + "epoch": 7.549509366636931, + "grad_norm": 1.5016642808914185, + "learning_rate": 2.456599286563615e-06, + "loss": 0.3752, + "step": 3178 + }, + { + "epoch": 7.551888195063931, + "grad_norm": 1.5114887952804565, + "learning_rate": 2.4542211652794296e-06, + "loss": 0.3088, + "step": 3179 + }, + { + "epoch": 7.55426702349093, + "grad_norm": 1.6318726539611816, + "learning_rate": 2.451843043995244e-06, + "loss": 0.3201, + "step": 3180 + }, + { + "epoch": 7.5566458519179305, + "grad_norm": 1.4275195598602295, + "learning_rate": 2.4494649227110584e-06, + "loss": 0.3258, + "step": 3181 + }, + { + "epoch": 7.55902468034493, + "grad_norm": 1.525590181350708, + "learning_rate": 2.447086801426873e-06, + "loss": 0.3356, + "step": 3182 + }, + { + "epoch": 7.56140350877193, + "grad_norm": 1.6062564849853516, + "learning_rate": 2.4447086801426876e-06, + "loss": 0.3272, + "step": 3183 + }, + { + "epoch": 7.56378233719893, + "grad_norm": 1.3662217855453491, + "learning_rate": 2.4423305588585018e-06, + "loss": 0.3305, + "step": 3184 + }, + { + "epoch": 7.566161165625929, + "grad_norm": 1.3494327068328857, + "learning_rate": 2.4399524375743164e-06, + "loss": 0.2795, + "step": 3185 + }, + { + "epoch": 7.568539994052929, + "grad_norm": 1.4314560890197754, + "learning_rate": 2.437574316290131e-06, + "loss": 0.3101, + "step": 3186 + }, + { + "epoch": 7.570918822479928, + "grad_norm": 1.456465482711792, + "learning_rate": 2.4351961950059456e-06, + "loss": 0.3772, + "step": 3187 + }, + { + "epoch": 7.5732976509069285, + "grad_norm": 1.3630119562149048, + "learning_rate": 2.4328180737217598e-06, + "loss": 0.2738, + "step": 3188 + }, + { + "epoch": 7.575676479333928, + "grad_norm": 1.4903489351272583, + "learning_rate": 2.430439952437575e-06, + "loss": 0.391, + "step": 3189 + }, + { + "epoch": 7.578055307760928, + "grad_norm": 1.5412935018539429, + "learning_rate": 2.428061831153389e-06, + "loss": 0.3331, + "step": 3190 + }, + { + "epoch": 7.580434136187927, + "grad_norm": 1.4487465620040894, + "learning_rate": 2.4256837098692036e-06, + "loss": 0.3105, + "step": 3191 + }, + { + "epoch": 7.582812964614927, + "grad_norm": 1.4419939517974854, + "learning_rate": 2.423305588585018e-06, + "loss": 0.3513, + "step": 3192 + }, + { + "epoch": 7.585191793041927, + "grad_norm": 1.6065683364868164, + "learning_rate": 2.4209274673008324e-06, + "loss": 0.3864, + "step": 3193 + }, + { + "epoch": 7.5875706214689265, + "grad_norm": 1.4458733797073364, + "learning_rate": 2.418549346016647e-06, + "loss": 0.2906, + "step": 3194 + }, + { + "epoch": 7.589949449895927, + "grad_norm": 1.6465253829956055, + "learning_rate": 2.4161712247324616e-06, + "loss": 0.3963, + "step": 3195 + }, + { + "epoch": 7.592328278322926, + "grad_norm": 1.6548629999160767, + "learning_rate": 2.4137931034482762e-06, + "loss": 0.3724, + "step": 3196 + }, + { + "epoch": 7.594707106749926, + "grad_norm": 1.3724602460861206, + "learning_rate": 2.4114149821640904e-06, + "loss": 0.3724, + "step": 3197 + }, + { + "epoch": 7.597085935176925, + "grad_norm": 1.7387632131576538, + "learning_rate": 2.409036860879905e-06, + "loss": 0.356, + "step": 3198 + }, + { + "epoch": 7.599464763603925, + "grad_norm": 1.2654589414596558, + "learning_rate": 2.4066587395957192e-06, + "loss": 0.2753, + "step": 3199 + }, + { + "epoch": 7.601843592030924, + "grad_norm": 1.5443251132965088, + "learning_rate": 2.4042806183115343e-06, + "loss": 0.3594, + "step": 3200 + }, + { + "epoch": 7.601843592030924, + "eval_loss": 0.42497825622558594, + "eval_runtime": 22.6002, + "eval_samples_per_second": 33.097, + "eval_steps_per_second": 16.549, + "step": 3200 + }, + { + "epoch": 7.6042224204579245, + "grad_norm": 1.4972622394561768, + "learning_rate": 2.4019024970273485e-06, + "loss": 0.3514, + "step": 3201 + }, + { + "epoch": 7.606601248884925, + "grad_norm": 1.6053504943847656, + "learning_rate": 2.399524375743163e-06, + "loss": 0.3417, + "step": 3202 + }, + { + "epoch": 7.608980077311924, + "grad_norm": 1.3861372470855713, + "learning_rate": 2.3971462544589777e-06, + "loss": 0.3483, + "step": 3203 + }, + { + "epoch": 7.611358905738924, + "grad_norm": 1.7831602096557617, + "learning_rate": 2.3947681331747923e-06, + "loss": 0.367, + "step": 3204 + }, + { + "epoch": 7.613737734165923, + "grad_norm": 1.3479951620101929, + "learning_rate": 2.3923900118906065e-06, + "loss": 0.3095, + "step": 3205 + }, + { + "epoch": 7.616116562592923, + "grad_norm": 1.722071647644043, + "learning_rate": 2.390011890606421e-06, + "loss": 0.3721, + "step": 3206 + }, + { + "epoch": 7.6184953910199225, + "grad_norm": 1.2605664730072021, + "learning_rate": 2.3876337693222357e-06, + "loss": 0.32, + "step": 3207 + }, + { + "epoch": 7.620874219446923, + "grad_norm": 1.4468492269515991, + "learning_rate": 2.3852556480380503e-06, + "loss": 0.3259, + "step": 3208 + }, + { + "epoch": 7.623253047873922, + "grad_norm": 1.479659914970398, + "learning_rate": 2.3828775267538645e-06, + "loss": 0.3108, + "step": 3209 + }, + { + "epoch": 7.625631876300922, + "grad_norm": 1.545765995979309, + "learning_rate": 2.380499405469679e-06, + "loss": 0.3442, + "step": 3210 + }, + { + "epoch": 7.628010704727922, + "grad_norm": 1.6254661083221436, + "learning_rate": 2.3781212841854937e-06, + "loss": 0.3711, + "step": 3211 + }, + { + "epoch": 7.630389533154921, + "grad_norm": 1.8455719947814941, + "learning_rate": 2.375743162901308e-06, + "loss": 0.4054, + "step": 3212 + }, + { + "epoch": 7.632768361581921, + "grad_norm": 1.3157535791397095, + "learning_rate": 2.373365041617123e-06, + "loss": 0.2769, + "step": 3213 + }, + { + "epoch": 7.6351471900089205, + "grad_norm": 1.7591217756271362, + "learning_rate": 2.370986920332937e-06, + "loss": 0.3957, + "step": 3214 + }, + { + "epoch": 7.637526018435921, + "grad_norm": 1.5369925498962402, + "learning_rate": 2.3686087990487517e-06, + "loss": 0.3817, + "step": 3215 + }, + { + "epoch": 7.63990484686292, + "grad_norm": 1.386925220489502, + "learning_rate": 2.366230677764566e-06, + "loss": 0.3266, + "step": 3216 + }, + { + "epoch": 7.64228367528992, + "grad_norm": 1.4969427585601807, + "learning_rate": 2.363852556480381e-06, + "loss": 0.3718, + "step": 3217 + }, + { + "epoch": 7.644662503716919, + "grad_norm": 1.7171121835708618, + "learning_rate": 2.361474435196195e-06, + "loss": 0.356, + "step": 3218 + }, + { + "epoch": 7.647041332143919, + "grad_norm": 1.5773299932479858, + "learning_rate": 2.3590963139120098e-06, + "loss": 0.3571, + "step": 3219 + }, + { + "epoch": 7.6494201605709184, + "grad_norm": 1.5871880054473877, + "learning_rate": 2.3567181926278244e-06, + "loss": 0.3847, + "step": 3220 + }, + { + "epoch": 7.6517989889979185, + "grad_norm": 1.567693829536438, + "learning_rate": 2.3543400713436386e-06, + "loss": 0.3394, + "step": 3221 + }, + { + "epoch": 7.654177817424918, + "grad_norm": 1.726014256477356, + "learning_rate": 2.351961950059453e-06, + "loss": 0.3522, + "step": 3222 + }, + { + "epoch": 7.656556645851918, + "grad_norm": 2.0128681659698486, + "learning_rate": 2.3495838287752678e-06, + "loss": 0.3879, + "step": 3223 + }, + { + "epoch": 7.658935474278918, + "grad_norm": 1.673500657081604, + "learning_rate": 2.3472057074910824e-06, + "loss": 0.3341, + "step": 3224 + }, + { + "epoch": 7.661314302705917, + "grad_norm": 1.4813228845596313, + "learning_rate": 2.3448275862068966e-06, + "loss": 0.2902, + "step": 3225 + }, + { + "epoch": 7.663693131132917, + "grad_norm": 1.5459859371185303, + "learning_rate": 2.342449464922711e-06, + "loss": 0.3042, + "step": 3226 + }, + { + "epoch": 7.6660719595599165, + "grad_norm": 1.5695363283157349, + "learning_rate": 2.340071343638526e-06, + "loss": 0.3322, + "step": 3227 + }, + { + "epoch": 7.668450787986917, + "grad_norm": 1.5266709327697754, + "learning_rate": 2.3376932223543404e-06, + "loss": 0.3582, + "step": 3228 + }, + { + "epoch": 7.670829616413916, + "grad_norm": 1.6366214752197266, + "learning_rate": 2.3353151010701546e-06, + "loss": 0.3332, + "step": 3229 + }, + { + "epoch": 7.673208444840916, + "grad_norm": 1.4262956380844116, + "learning_rate": 2.332936979785969e-06, + "loss": 0.3517, + "step": 3230 + }, + { + "epoch": 7.675587273267915, + "grad_norm": 1.5035724639892578, + "learning_rate": 2.330558858501784e-06, + "loss": 0.3089, + "step": 3231 + }, + { + "epoch": 7.677966101694915, + "grad_norm": 1.4583079814910889, + "learning_rate": 2.3281807372175984e-06, + "loss": 0.2988, + "step": 3232 + }, + { + "epoch": 7.680344930121915, + "grad_norm": 1.5728484392166138, + "learning_rate": 2.3258026159334126e-06, + "loss": 0.3385, + "step": 3233 + }, + { + "epoch": 7.6827237585489145, + "grad_norm": 1.5394612550735474, + "learning_rate": 2.3234244946492272e-06, + "loss": 0.3361, + "step": 3234 + }, + { + "epoch": 7.685102586975915, + "grad_norm": 1.313349723815918, + "learning_rate": 2.321046373365042e-06, + "loss": 0.3041, + "step": 3235 + }, + { + "epoch": 7.687481415402914, + "grad_norm": 1.500716209411621, + "learning_rate": 2.318668252080856e-06, + "loss": 0.3274, + "step": 3236 + }, + { + "epoch": 7.689860243829914, + "grad_norm": 1.3671340942382812, + "learning_rate": 2.316290130796671e-06, + "loss": 0.315, + "step": 3237 + }, + { + "epoch": 7.692239072256913, + "grad_norm": 1.5590726137161255, + "learning_rate": 2.3139120095124852e-06, + "loss": 0.3803, + "step": 3238 + }, + { + "epoch": 7.694617900683913, + "grad_norm": 1.495032787322998, + "learning_rate": 2.3115338882283e-06, + "loss": 0.361, + "step": 3239 + }, + { + "epoch": 7.6969967291109125, + "grad_norm": 1.4779114723205566, + "learning_rate": 2.3091557669441145e-06, + "loss": 0.2962, + "step": 3240 + }, + { + "epoch": 7.699375557537913, + "grad_norm": 1.386444330215454, + "learning_rate": 2.306777645659929e-06, + "loss": 0.3386, + "step": 3241 + }, + { + "epoch": 7.701754385964913, + "grad_norm": 1.6075958013534546, + "learning_rate": 2.3043995243757433e-06, + "loss": 0.3904, + "step": 3242 + }, + { + "epoch": 7.704133214391912, + "grad_norm": 1.5916156768798828, + "learning_rate": 2.302021403091558e-06, + "loss": 0.3453, + "step": 3243 + }, + { + "epoch": 7.706512042818912, + "grad_norm": 1.564153790473938, + "learning_rate": 2.2996432818073725e-06, + "loss": 0.3704, + "step": 3244 + }, + { + "epoch": 7.708890871245911, + "grad_norm": 1.640138030052185, + "learning_rate": 2.2972651605231867e-06, + "loss": 0.368, + "step": 3245 + }, + { + "epoch": 7.711269699672911, + "grad_norm": 1.4035396575927734, + "learning_rate": 2.2948870392390013e-06, + "loss": 0.2413, + "step": 3246 + }, + { + "epoch": 7.7136485280999105, + "grad_norm": 1.6582856178283691, + "learning_rate": 2.292508917954816e-06, + "loss": 0.3774, + "step": 3247 + }, + { + "epoch": 7.716027356526911, + "grad_norm": 1.5870928764343262, + "learning_rate": 2.2901307966706305e-06, + "loss": 0.3193, + "step": 3248 + }, + { + "epoch": 7.71840618495391, + "grad_norm": 1.54490065574646, + "learning_rate": 2.2877526753864447e-06, + "loss": 0.3646, + "step": 3249 + }, + { + "epoch": 7.72078501338091, + "grad_norm": 1.3012723922729492, + "learning_rate": 2.2853745541022593e-06, + "loss": 0.3209, + "step": 3250 + }, + { + "epoch": 7.72078501338091, + "eval_loss": 0.42389169335365295, + "eval_runtime": 22.619, + "eval_samples_per_second": 33.07, + "eval_steps_per_second": 16.535, + "step": 3250 + }, + { + "epoch": 7.72316384180791, + "grad_norm": 1.3882126808166504, + "learning_rate": 2.282996432818074e-06, + "loss": 0.2628, + "step": 3251 + }, + { + "epoch": 7.725542670234909, + "grad_norm": 1.4228448867797852, + "learning_rate": 2.2806183115338885e-06, + "loss": 0.3251, + "step": 3252 + }, + { + "epoch": 7.727921498661909, + "grad_norm": 1.6120049953460693, + "learning_rate": 2.2782401902497027e-06, + "loss": 0.3343, + "step": 3253 + }, + { + "epoch": 7.730300327088909, + "grad_norm": 1.6220868825912476, + "learning_rate": 2.2758620689655173e-06, + "loss": 0.4298, + "step": 3254 + }, + { + "epoch": 7.732679155515909, + "grad_norm": 1.703961730003357, + "learning_rate": 2.273483947681332e-06, + "loss": 0.3422, + "step": 3255 + }, + { + "epoch": 7.735057983942908, + "grad_norm": 1.699151873588562, + "learning_rate": 2.2711058263971466e-06, + "loss": 0.3731, + "step": 3256 + }, + { + "epoch": 7.737436812369908, + "grad_norm": 1.514265775680542, + "learning_rate": 2.2687277051129607e-06, + "loss": 0.3386, + "step": 3257 + }, + { + "epoch": 7.739815640796907, + "grad_norm": 1.4858217239379883, + "learning_rate": 2.2663495838287753e-06, + "loss": 0.3025, + "step": 3258 + }, + { + "epoch": 7.742194469223907, + "grad_norm": 1.7378199100494385, + "learning_rate": 2.26397146254459e-06, + "loss": 0.3614, + "step": 3259 + }, + { + "epoch": 7.744573297650907, + "grad_norm": 1.6453464031219482, + "learning_rate": 2.2615933412604046e-06, + "loss": 0.3889, + "step": 3260 + }, + { + "epoch": 7.746952126077907, + "grad_norm": 1.320141077041626, + "learning_rate": 2.259215219976219e-06, + "loss": 0.2731, + "step": 3261 + }, + { + "epoch": 7.749330954504907, + "grad_norm": 1.4212877750396729, + "learning_rate": 2.2568370986920334e-06, + "loss": 0.3335, + "step": 3262 + }, + { + "epoch": 7.751709782931906, + "grad_norm": 1.626476526260376, + "learning_rate": 2.254458977407848e-06, + "loss": 0.4678, + "step": 3263 + }, + { + "epoch": 7.754088611358906, + "grad_norm": 1.5179169178009033, + "learning_rate": 2.2520808561236626e-06, + "loss": 0.331, + "step": 3264 + }, + { + "epoch": 7.756467439785905, + "grad_norm": 1.3198221921920776, + "learning_rate": 2.249702734839477e-06, + "loss": 0.3097, + "step": 3265 + }, + { + "epoch": 7.758846268212905, + "grad_norm": 1.4053378105163574, + "learning_rate": 2.2473246135552914e-06, + "loss": 0.3013, + "step": 3266 + }, + { + "epoch": 7.761225096639905, + "grad_norm": 1.187252402305603, + "learning_rate": 2.244946492271106e-06, + "loss": 0.2605, + "step": 3267 + }, + { + "epoch": 7.763603925066905, + "grad_norm": 1.4929299354553223, + "learning_rate": 2.2425683709869206e-06, + "loss": 0.3242, + "step": 3268 + }, + { + "epoch": 7.765982753493904, + "grad_norm": 1.285935401916504, + "learning_rate": 2.2401902497027352e-06, + "loss": 0.2904, + "step": 3269 + }, + { + "epoch": 7.768361581920904, + "grad_norm": 1.5902963876724243, + "learning_rate": 2.2378121284185494e-06, + "loss": 0.4496, + "step": 3270 + }, + { + "epoch": 7.770740410347903, + "grad_norm": 1.577108383178711, + "learning_rate": 2.235434007134364e-06, + "loss": 0.3567, + "step": 3271 + }, + { + "epoch": 7.773119238774903, + "grad_norm": 1.5428974628448486, + "learning_rate": 2.2330558858501786e-06, + "loss": 0.3829, + "step": 3272 + }, + { + "epoch": 7.775498067201903, + "grad_norm": 1.3443043231964111, + "learning_rate": 2.230677764565993e-06, + "loss": 0.315, + "step": 3273 + }, + { + "epoch": 7.777876895628903, + "grad_norm": 1.4463046789169312, + "learning_rate": 2.2282996432818074e-06, + "loss": 0.293, + "step": 3274 + }, + { + "epoch": 7.780255724055903, + "grad_norm": 1.683605670928955, + "learning_rate": 2.225921521997622e-06, + "loss": 0.3667, + "step": 3275 + }, + { + "epoch": 7.782634552482902, + "grad_norm": 1.3748691082000732, + "learning_rate": 2.2235434007134367e-06, + "loss": 0.3326, + "step": 3276 + }, + { + "epoch": 7.785013380909902, + "grad_norm": 1.388370156288147, + "learning_rate": 2.221165279429251e-06, + "loss": 0.3255, + "step": 3277 + }, + { + "epoch": 7.787392209336901, + "grad_norm": 1.4863688945770264, + "learning_rate": 2.218787158145066e-06, + "loss": 0.2711, + "step": 3278 + }, + { + "epoch": 7.789771037763901, + "grad_norm": 1.6458524465560913, + "learning_rate": 2.21640903686088e-06, + "loss": 0.4041, + "step": 3279 + }, + { + "epoch": 7.7921498661909006, + "grad_norm": 1.3691445589065552, + "learning_rate": 2.2140309155766947e-06, + "loss": 0.3342, + "step": 3280 + }, + { + "epoch": 7.794528694617901, + "grad_norm": 1.524095058441162, + "learning_rate": 2.2116527942925093e-06, + "loss": 0.2897, + "step": 3281 + }, + { + "epoch": 7.796907523044901, + "grad_norm": 1.4536031484603882, + "learning_rate": 2.2092746730083235e-06, + "loss": 0.3803, + "step": 3282 + }, + { + "epoch": 7.7992863514719, + "grad_norm": 1.5976150035858154, + "learning_rate": 2.206896551724138e-06, + "loss": 0.3566, + "step": 3283 + }, + { + "epoch": 7.8016651798989, + "grad_norm": 1.3656028509140015, + "learning_rate": 2.2045184304399527e-06, + "loss": 0.2952, + "step": 3284 + }, + { + "epoch": 7.804044008325899, + "grad_norm": 1.4566000699996948, + "learning_rate": 2.2021403091557673e-06, + "loss": 0.3413, + "step": 3285 + }, + { + "epoch": 7.806422836752899, + "grad_norm": 1.532050371170044, + "learning_rate": 2.1997621878715815e-06, + "loss": 0.3205, + "step": 3286 + }, + { + "epoch": 7.808801665179899, + "grad_norm": 1.3219133615493774, + "learning_rate": 2.197384066587396e-06, + "loss": 0.3142, + "step": 3287 + }, + { + "epoch": 7.811180493606899, + "grad_norm": 1.5488229990005493, + "learning_rate": 2.1950059453032107e-06, + "loss": 0.3159, + "step": 3288 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 1.385267734527588, + "learning_rate": 2.1926278240190253e-06, + "loss": 0.3136, + "step": 3289 + }, + { + "epoch": 7.815938150460898, + "grad_norm": 1.3824723958969116, + "learning_rate": 2.1902497027348395e-06, + "loss": 0.3308, + "step": 3290 + }, + { + "epoch": 7.818316978887898, + "grad_norm": 1.3473165035247803, + "learning_rate": 2.187871581450654e-06, + "loss": 0.3016, + "step": 3291 + }, + { + "epoch": 7.820695807314897, + "grad_norm": 1.671425461769104, + "learning_rate": 2.1854934601664687e-06, + "loss": 0.3287, + "step": 3292 + }, + { + "epoch": 7.823074635741897, + "grad_norm": 1.8494575023651123, + "learning_rate": 2.1831153388822833e-06, + "loss": 0.4625, + "step": 3293 + }, + { + "epoch": 7.825453464168897, + "grad_norm": 1.5258232355117798, + "learning_rate": 2.1807372175980975e-06, + "loss": 0.3104, + "step": 3294 + }, + { + "epoch": 7.827832292595897, + "grad_norm": 1.4781297445297241, + "learning_rate": 2.178359096313912e-06, + "loss": 0.3373, + "step": 3295 + }, + { + "epoch": 7.830211121022896, + "grad_norm": 1.4028934240341187, + "learning_rate": 2.1759809750297268e-06, + "loss": 0.3157, + "step": 3296 + }, + { + "epoch": 7.832589949449896, + "grad_norm": 1.5227335691452026, + "learning_rate": 2.173602853745541e-06, + "loss": 0.3849, + "step": 3297 + }, + { + "epoch": 7.834968777876895, + "grad_norm": 1.5383323431015015, + "learning_rate": 2.171224732461356e-06, + "loss": 0.3445, + "step": 3298 + }, + { + "epoch": 7.837347606303895, + "grad_norm": 1.5795097351074219, + "learning_rate": 2.16884661117717e-06, + "loss": 0.327, + "step": 3299 + }, + { + "epoch": 7.8397264347308955, + "grad_norm": 1.5863862037658691, + "learning_rate": 2.1664684898929848e-06, + "loss": 0.2923, + "step": 3300 + }, + { + "epoch": 7.8397264347308955, + "eval_loss": 0.42450374364852905, + "eval_runtime": 22.6657, + "eval_samples_per_second": 33.001, + "eval_steps_per_second": 16.501, + "step": 3300 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 1.3154478073120117, + "learning_rate": 2.164090368608799e-06, + "loss": 0.2799, + "step": 3301 + }, + { + "epoch": 7.844484091584895, + "grad_norm": 1.413351058959961, + "learning_rate": 2.161712247324614e-06, + "loss": 0.3262, + "step": 3302 + }, + { + "epoch": 7.846862920011894, + "grad_norm": 1.6113295555114746, + "learning_rate": 2.159334126040428e-06, + "loss": 0.3757, + "step": 3303 + }, + { + "epoch": 7.849241748438894, + "grad_norm": 1.6752146482467651, + "learning_rate": 2.156956004756243e-06, + "loss": 0.4082, + "step": 3304 + }, + { + "epoch": 7.851620576865893, + "grad_norm": 1.4386396408081055, + "learning_rate": 2.1545778834720574e-06, + "loss": 0.2628, + "step": 3305 + }, + { + "epoch": 7.853999405292893, + "grad_norm": 1.5418752431869507, + "learning_rate": 2.1521997621878716e-06, + "loss": 0.35, + "step": 3306 + }, + { + "epoch": 7.856378233719893, + "grad_norm": 1.617397427558899, + "learning_rate": 2.149821640903686e-06, + "loss": 0.313, + "step": 3307 + }, + { + "epoch": 7.858757062146893, + "grad_norm": 1.5032774209976196, + "learning_rate": 2.147443519619501e-06, + "loss": 0.3223, + "step": 3308 + }, + { + "epoch": 7.861135890573893, + "grad_norm": 1.3607796430587769, + "learning_rate": 2.1450653983353154e-06, + "loss": 0.3384, + "step": 3309 + }, + { + "epoch": 7.863514719000892, + "grad_norm": 1.4371676445007324, + "learning_rate": 2.1426872770511296e-06, + "loss": 0.2897, + "step": 3310 + }, + { + "epoch": 7.865893547427892, + "grad_norm": 1.5669142007827759, + "learning_rate": 2.1403091557669442e-06, + "loss": 0.3416, + "step": 3311 + }, + { + "epoch": 7.868272375854891, + "grad_norm": 1.8419386148452759, + "learning_rate": 2.137931034482759e-06, + "loss": 0.3711, + "step": 3312 + }, + { + "epoch": 7.8706512042818915, + "grad_norm": 1.4291188716888428, + "learning_rate": 2.1355529131985734e-06, + "loss": 0.3239, + "step": 3313 + }, + { + "epoch": 7.873030032708891, + "grad_norm": 1.5607532262802124, + "learning_rate": 2.1331747919143876e-06, + "loss": 0.3371, + "step": 3314 + }, + { + "epoch": 7.875408861135891, + "grad_norm": 1.4725655317306519, + "learning_rate": 2.1307966706302022e-06, + "loss": 0.2898, + "step": 3315 + }, + { + "epoch": 7.87778768956289, + "grad_norm": 1.422587513923645, + "learning_rate": 2.128418549346017e-06, + "loss": 0.313, + "step": 3316 + }, + { + "epoch": 7.88016651798989, + "grad_norm": 1.450711965560913, + "learning_rate": 2.1260404280618315e-06, + "loss": 0.3345, + "step": 3317 + }, + { + "epoch": 7.882545346416889, + "grad_norm": 1.432166337966919, + "learning_rate": 2.1236623067776456e-06, + "loss": 0.3023, + "step": 3318 + }, + { + "epoch": 7.884924174843889, + "grad_norm": 1.5485718250274658, + "learning_rate": 2.1212841854934603e-06, + "loss": 0.3627, + "step": 3319 + }, + { + "epoch": 7.887303003270889, + "grad_norm": 1.3737921714782715, + "learning_rate": 2.118906064209275e-06, + "loss": 0.3654, + "step": 3320 + }, + { + "epoch": 7.889681831697889, + "grad_norm": 1.20768141746521, + "learning_rate": 2.1165279429250895e-06, + "loss": 0.2205, + "step": 3321 + }, + { + "epoch": 7.892060660124889, + "grad_norm": 1.5668071508407593, + "learning_rate": 2.114149821640904e-06, + "loss": 0.3058, + "step": 3322 + }, + { + "epoch": 7.894439488551888, + "grad_norm": 1.3655834197998047, + "learning_rate": 2.1117717003567183e-06, + "loss": 0.3408, + "step": 3323 + }, + { + "epoch": 7.896818316978888, + "grad_norm": 1.383075475692749, + "learning_rate": 2.109393579072533e-06, + "loss": 0.3571, + "step": 3324 + }, + { + "epoch": 7.899197145405887, + "grad_norm": 1.542683482170105, + "learning_rate": 2.107015457788347e-06, + "loss": 0.309, + "step": 3325 + }, + { + "epoch": 7.9015759738328875, + "grad_norm": 1.7908352613449097, + "learning_rate": 2.104637336504162e-06, + "loss": 0.3746, + "step": 3326 + }, + { + "epoch": 7.903954802259887, + "grad_norm": 1.4961248636245728, + "learning_rate": 2.1022592152199763e-06, + "loss": 0.2738, + "step": 3327 + }, + { + "epoch": 7.906333630686887, + "grad_norm": 1.5349233150482178, + "learning_rate": 2.099881093935791e-06, + "loss": 0.3652, + "step": 3328 + }, + { + "epoch": 7.908712459113886, + "grad_norm": 1.5840017795562744, + "learning_rate": 2.0975029726516055e-06, + "loss": 0.3318, + "step": 3329 + }, + { + "epoch": 7.911091287540886, + "grad_norm": 1.4789258241653442, + "learning_rate": 2.09512485136742e-06, + "loss": 0.3599, + "step": 3330 + }, + { + "epoch": 7.913470115967886, + "grad_norm": 1.5279736518859863, + "learning_rate": 2.0927467300832343e-06, + "loss": 0.3246, + "step": 3331 + }, + { + "epoch": 7.915848944394885, + "grad_norm": 1.7751861810684204, + "learning_rate": 2.090368608799049e-06, + "loss": 0.2995, + "step": 3332 + }, + { + "epoch": 7.9182277728218855, + "grad_norm": 1.5990631580352783, + "learning_rate": 2.0879904875148635e-06, + "loss": 0.3247, + "step": 3333 + }, + { + "epoch": 7.920606601248885, + "grad_norm": 1.6441344022750854, + "learning_rate": 2.0856123662306777e-06, + "loss": 0.3412, + "step": 3334 + }, + { + "epoch": 7.922985429675885, + "grad_norm": 1.5585304498672485, + "learning_rate": 2.0832342449464923e-06, + "loss": 0.327, + "step": 3335 + }, + { + "epoch": 7.925364258102884, + "grad_norm": 1.4753824472427368, + "learning_rate": 2.080856123662307e-06, + "loss": 0.3756, + "step": 3336 + }, + { + "epoch": 7.927743086529884, + "grad_norm": 1.3875900506973267, + "learning_rate": 2.0784780023781216e-06, + "loss": 0.3186, + "step": 3337 + }, + { + "epoch": 7.930121914956883, + "grad_norm": 1.5099917650222778, + "learning_rate": 2.0760998810939357e-06, + "loss": 0.2741, + "step": 3338 + }, + { + "epoch": 7.9325007433838834, + "grad_norm": 1.454820990562439, + "learning_rate": 2.0737217598097508e-06, + "loss": 0.286, + "step": 3339 + }, + { + "epoch": 7.9348795718108835, + "grad_norm": 1.4284183979034424, + "learning_rate": 2.071343638525565e-06, + "loss": 0.3343, + "step": 3340 + }, + { + "epoch": 7.937258400237883, + "grad_norm": 1.480200171470642, + "learning_rate": 2.0689655172413796e-06, + "loss": 0.2827, + "step": 3341 + }, + { + "epoch": 7.939637228664883, + "grad_norm": 1.6284431219100952, + "learning_rate": 2.0665873959571938e-06, + "loss": 0.3214, + "step": 3342 + }, + { + "epoch": 7.942016057091882, + "grad_norm": 1.515316128730774, + "learning_rate": 2.0642092746730084e-06, + "loss": 0.3155, + "step": 3343 + }, + { + "epoch": 7.944394885518882, + "grad_norm": 1.5584819316864014, + "learning_rate": 2.061831153388823e-06, + "loss": 0.3014, + "step": 3344 + }, + { + "epoch": 7.946773713945881, + "grad_norm": 1.644065499305725, + "learning_rate": 2.0594530321046376e-06, + "loss": 0.3693, + "step": 3345 + }, + { + "epoch": 7.9491525423728815, + "grad_norm": 1.6997952461242676, + "learning_rate": 2.0570749108204522e-06, + "loss": 0.3404, + "step": 3346 + }, + { + "epoch": 7.951531370799881, + "grad_norm": 1.2940443754196167, + "learning_rate": 2.0546967895362664e-06, + "loss": 0.285, + "step": 3347 + }, + { + "epoch": 7.953910199226881, + "grad_norm": 1.5254204273223877, + "learning_rate": 2.052318668252081e-06, + "loss": 0.2955, + "step": 3348 + }, + { + "epoch": 7.956289027653881, + "grad_norm": 1.5617763996124268, + "learning_rate": 2.049940546967895e-06, + "loss": 0.3283, + "step": 3349 + }, + { + "epoch": 7.95866785608088, + "grad_norm": 1.8502612113952637, + "learning_rate": 2.0475624256837102e-06, + "loss": 0.323, + "step": 3350 + }, + { + "epoch": 7.95866785608088, + "eval_loss": 0.4246118664741516, + "eval_runtime": 22.6292, + "eval_samples_per_second": 33.055, + "eval_steps_per_second": 16.527, + "step": 3350 + }, + { + "epoch": 7.96104668450788, + "grad_norm": 1.4865460395812988, + "learning_rate": 2.0451843043995244e-06, + "loss": 0.3377, + "step": 3351 + }, + { + "epoch": 7.963425512934879, + "grad_norm": 1.345357894897461, + "learning_rate": 2.042806183115339e-06, + "loss": 0.2895, + "step": 3352 + }, + { + "epoch": 7.9658043413618795, + "grad_norm": 1.4628435373306274, + "learning_rate": 2.0404280618311536e-06, + "loss": 0.4117, + "step": 3353 + }, + { + "epoch": 7.968183169788879, + "grad_norm": 1.6087473630905151, + "learning_rate": 2.0380499405469683e-06, + "loss": 0.3441, + "step": 3354 + }, + { + "epoch": 7.970561998215879, + "grad_norm": 1.6876890659332275, + "learning_rate": 2.0356718192627824e-06, + "loss": 0.4073, + "step": 3355 + }, + { + "epoch": 7.972940826642878, + "grad_norm": 1.698161005973816, + "learning_rate": 2.033293697978597e-06, + "loss": 0.3594, + "step": 3356 + }, + { + "epoch": 7.975319655069878, + "grad_norm": 1.529119610786438, + "learning_rate": 2.0309155766944117e-06, + "loss": 0.3391, + "step": 3357 + }, + { + "epoch": 7.977698483496878, + "grad_norm": 1.639570951461792, + "learning_rate": 2.0285374554102263e-06, + "loss": 0.333, + "step": 3358 + }, + { + "epoch": 7.9800773119238775, + "grad_norm": 1.361391305923462, + "learning_rate": 2.0261593341260405e-06, + "loss": 0.2917, + "step": 3359 + }, + { + "epoch": 7.982456140350877, + "grad_norm": 1.362161636352539, + "learning_rate": 2.023781212841855e-06, + "loss": 0.3155, + "step": 3360 + }, + { + "epoch": 7.984834968777877, + "grad_norm": 1.5175408124923706, + "learning_rate": 2.0214030915576697e-06, + "loss": 0.3331, + "step": 3361 + }, + { + "epoch": 7.987213797204877, + "grad_norm": 1.5390911102294922, + "learning_rate": 2.019024970273484e-06, + "loss": 0.3642, + "step": 3362 + }, + { + "epoch": 7.989592625631876, + "grad_norm": 1.5496313571929932, + "learning_rate": 2.016646848989299e-06, + "loss": 0.2886, + "step": 3363 + }, + { + "epoch": 7.991971454058876, + "grad_norm": 1.3979161977767944, + "learning_rate": 2.014268727705113e-06, + "loss": 0.2977, + "step": 3364 + }, + { + "epoch": 7.994350282485875, + "grad_norm": 1.5167146921157837, + "learning_rate": 2.0118906064209277e-06, + "loss": 0.2558, + "step": 3365 + }, + { + "epoch": 7.9967291109128755, + "grad_norm": 1.5513628721237183, + "learning_rate": 2.009512485136742e-06, + "loss": 0.435, + "step": 3366 + }, + { + "epoch": 7.999107939339875, + "grad_norm": 1.586025595664978, + "learning_rate": 2.007134363852557e-06, + "loss": 0.3244, + "step": 3367 + }, + { + "epoch": 8.0, + "grad_norm": 3.2887303829193115, + "learning_rate": 2.004756242568371e-06, + "loss": 0.3289, + "step": 3368 + }, + { + "epoch": 8.002378828427, + "grad_norm": 1.6042978763580322, + "learning_rate": 2.0023781212841857e-06, + "loss": 0.3288, + "step": 3369 + }, + { + "epoch": 8.004757656854, + "grad_norm": 1.5526121854782104, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3937, + "step": 3370 + }, + { + "epoch": 8.007136485280999, + "grad_norm": 1.5970600843429565, + "learning_rate": 1.9976218787158145e-06, + "loss": 0.3451, + "step": 3371 + }, + { + "epoch": 8.009515313707999, + "grad_norm": 1.5648374557495117, + "learning_rate": 1.995243757431629e-06, + "loss": 0.3365, + "step": 3372 + }, + { + "epoch": 8.011894142134999, + "grad_norm": 1.4162542819976807, + "learning_rate": 1.9928656361474437e-06, + "loss": 0.3378, + "step": 3373 + }, + { + "epoch": 8.014272970561999, + "grad_norm": 1.5529794692993164, + "learning_rate": 1.9904875148632584e-06, + "loss": 0.3447, + "step": 3374 + }, + { + "epoch": 8.016651798988997, + "grad_norm": 1.365196943283081, + "learning_rate": 1.9881093935790725e-06, + "loss": 0.3298, + "step": 3375 + }, + { + "epoch": 8.019030627415997, + "grad_norm": 1.6008450984954834, + "learning_rate": 1.985731272294887e-06, + "loss": 0.3812, + "step": 3376 + }, + { + "epoch": 8.021409455842997, + "grad_norm": 1.3331999778747559, + "learning_rate": 1.9833531510107018e-06, + "loss": 0.2895, + "step": 3377 + }, + { + "epoch": 8.023788284269997, + "grad_norm": 1.8346915245056152, + "learning_rate": 1.9809750297265164e-06, + "loss": 0.3922, + "step": 3378 + }, + { + "epoch": 8.026167112696998, + "grad_norm": 1.410758376121521, + "learning_rate": 1.9785969084423306e-06, + "loss": 0.317, + "step": 3379 + }, + { + "epoch": 8.028545941123996, + "grad_norm": 1.5187331438064575, + "learning_rate": 1.976218787158145e-06, + "loss": 0.3747, + "step": 3380 + }, + { + "epoch": 8.030924769550996, + "grad_norm": 1.4770883321762085, + "learning_rate": 1.9738406658739598e-06, + "loss": 0.3489, + "step": 3381 + }, + { + "epoch": 8.033303597977996, + "grad_norm": 1.5907210111618042, + "learning_rate": 1.9714625445897744e-06, + "loss": 0.4075, + "step": 3382 + }, + { + "epoch": 8.035682426404996, + "grad_norm": 1.5611907243728638, + "learning_rate": 1.9690844233055886e-06, + "loss": 0.3291, + "step": 3383 + }, + { + "epoch": 8.038061254831995, + "grad_norm": 1.633109211921692, + "learning_rate": 1.966706302021403e-06, + "loss": 0.3112, + "step": 3384 + }, + { + "epoch": 8.040440083258995, + "grad_norm": 1.7585550546646118, + "learning_rate": 1.964328180737218e-06, + "loss": 0.3608, + "step": 3385 + }, + { + "epoch": 8.042818911685995, + "grad_norm": 1.5994277000427246, + "learning_rate": 1.961950059453032e-06, + "loss": 0.2928, + "step": 3386 + }, + { + "epoch": 8.045197740112995, + "grad_norm": 1.534075379371643, + "learning_rate": 1.959571938168847e-06, + "loss": 0.2903, + "step": 3387 + }, + { + "epoch": 8.047576568539995, + "grad_norm": 1.4424840211868286, + "learning_rate": 1.957193816884661e-06, + "loss": 0.3093, + "step": 3388 + }, + { + "epoch": 8.049955396966993, + "grad_norm": 1.6643208265304565, + "learning_rate": 1.954815695600476e-06, + "loss": 0.3366, + "step": 3389 + }, + { + "epoch": 8.052334225393993, + "grad_norm": 1.5429790019989014, + "learning_rate": 1.95243757431629e-06, + "loss": 0.3524, + "step": 3390 + }, + { + "epoch": 8.054713053820993, + "grad_norm": 1.6027212142944336, + "learning_rate": 1.950059453032105e-06, + "loss": 0.386, + "step": 3391 + }, + { + "epoch": 8.057091882247994, + "grad_norm": 1.3652968406677246, + "learning_rate": 1.9476813317479192e-06, + "loss": 0.34, + "step": 3392 + }, + { + "epoch": 8.059470710674992, + "grad_norm": 1.4071269035339355, + "learning_rate": 1.945303210463734e-06, + "loss": 0.3301, + "step": 3393 + }, + { + "epoch": 8.061849539101992, + "grad_norm": 1.6511948108673096, + "learning_rate": 1.9429250891795485e-06, + "loss": 0.3532, + "step": 3394 + }, + { + "epoch": 8.064228367528992, + "grad_norm": 1.5096168518066406, + "learning_rate": 1.9405469678953626e-06, + "loss": 0.3617, + "step": 3395 + }, + { + "epoch": 8.066607195955992, + "grad_norm": 1.5085537433624268, + "learning_rate": 1.9381688466111773e-06, + "loss": 0.4086, + "step": 3396 + }, + { + "epoch": 8.06898602438299, + "grad_norm": 1.5266300439834595, + "learning_rate": 1.935790725326992e-06, + "loss": 0.3378, + "step": 3397 + }, + { + "epoch": 8.07136485280999, + "grad_norm": 1.2959940433502197, + "learning_rate": 1.9334126040428065e-06, + "loss": 0.2976, + "step": 3398 + }, + { + "epoch": 8.07374368123699, + "grad_norm": 1.573110818862915, + "learning_rate": 1.9310344827586207e-06, + "loss": 0.3264, + "step": 3399 + }, + { + "epoch": 8.07612250966399, + "grad_norm": 1.6520140171051025, + "learning_rate": 1.9286563614744353e-06, + "loss": 0.3932, + "step": 3400 + }, + { + "epoch": 8.07612250966399, + "eval_loss": 0.4245797097682953, + "eval_runtime": 25.249, + "eval_samples_per_second": 29.625, + "eval_steps_per_second": 14.812, + "step": 3400 + }, + { + "epoch": 8.078501338090991, + "grad_norm": 1.5294541120529175, + "learning_rate": 1.92627824019025e-06, + "loss": 0.2853, + "step": 3401 + }, + { + "epoch": 8.08088016651799, + "grad_norm": 1.3718730211257935, + "learning_rate": 1.9239001189060645e-06, + "loss": 0.3257, + "step": 3402 + }, + { + "epoch": 8.08325899494499, + "grad_norm": 1.4249407052993774, + "learning_rate": 1.9215219976218787e-06, + "loss": 0.2929, + "step": 3403 + }, + { + "epoch": 8.08563782337199, + "grad_norm": 1.384666085243225, + "learning_rate": 1.9191438763376933e-06, + "loss": 0.2906, + "step": 3404 + }, + { + "epoch": 8.08801665179899, + "grad_norm": 1.530799388885498, + "learning_rate": 1.916765755053508e-06, + "loss": 0.3056, + "step": 3405 + }, + { + "epoch": 8.090395480225988, + "grad_norm": 1.5486053228378296, + "learning_rate": 1.9143876337693225e-06, + "loss": 0.3699, + "step": 3406 + }, + { + "epoch": 8.092774308652988, + "grad_norm": 1.5500376224517822, + "learning_rate": 1.9120095124851367e-06, + "loss": 0.3328, + "step": 3407 + }, + { + "epoch": 8.095153137079988, + "grad_norm": 1.4043387174606323, + "learning_rate": 1.9096313912009513e-06, + "loss": 0.3073, + "step": 3408 + }, + { + "epoch": 8.097531965506988, + "grad_norm": 1.5323762893676758, + "learning_rate": 1.907253269916766e-06, + "loss": 0.3573, + "step": 3409 + }, + { + "epoch": 8.099910793933988, + "grad_norm": 1.733586311340332, + "learning_rate": 1.9048751486325803e-06, + "loss": 0.3627, + "step": 3410 + }, + { + "epoch": 8.102289622360987, + "grad_norm": 1.44130539894104, + "learning_rate": 1.902497027348395e-06, + "loss": 0.281, + "step": 3411 + }, + { + "epoch": 8.104668450787987, + "grad_norm": 1.4701629877090454, + "learning_rate": 1.9001189060642095e-06, + "loss": 0.2894, + "step": 3412 + }, + { + "epoch": 8.107047279214987, + "grad_norm": 1.6174763441085815, + "learning_rate": 1.897740784780024e-06, + "loss": 0.3202, + "step": 3413 + }, + { + "epoch": 8.109426107641987, + "grad_norm": 1.3825629949569702, + "learning_rate": 1.8953626634958383e-06, + "loss": 0.2612, + "step": 3414 + }, + { + "epoch": 8.111804936068985, + "grad_norm": 1.5997891426086426, + "learning_rate": 1.892984542211653e-06, + "loss": 0.359, + "step": 3415 + }, + { + "epoch": 8.114183764495985, + "grad_norm": 1.429862141609192, + "learning_rate": 1.8906064209274674e-06, + "loss": 0.328, + "step": 3416 + }, + { + "epoch": 8.116562592922985, + "grad_norm": 1.4490629434585571, + "learning_rate": 1.888228299643282e-06, + "loss": 0.345, + "step": 3417 + }, + { + "epoch": 8.118941421349986, + "grad_norm": 1.5563952922821045, + "learning_rate": 1.8858501783590966e-06, + "loss": 0.3904, + "step": 3418 + }, + { + "epoch": 8.121320249776986, + "grad_norm": 1.5042918920516968, + "learning_rate": 1.883472057074911e-06, + "loss": 0.3149, + "step": 3419 + }, + { + "epoch": 8.123699078203984, + "grad_norm": 1.3745828866958618, + "learning_rate": 1.8810939357907254e-06, + "loss": 0.3214, + "step": 3420 + }, + { + "epoch": 8.126077906630984, + "grad_norm": 1.5733627080917358, + "learning_rate": 1.8787158145065402e-06, + "loss": 0.3674, + "step": 3421 + }, + { + "epoch": 8.128456735057984, + "grad_norm": 1.5564693212509155, + "learning_rate": 1.8763376932223546e-06, + "loss": 0.3243, + "step": 3422 + }, + { + "epoch": 8.130835563484984, + "grad_norm": 1.445373296737671, + "learning_rate": 1.873959571938169e-06, + "loss": 0.3479, + "step": 3423 + }, + { + "epoch": 8.133214391911983, + "grad_norm": 1.4341164827346802, + "learning_rate": 1.8715814506539834e-06, + "loss": 0.3634, + "step": 3424 + }, + { + "epoch": 8.135593220338983, + "grad_norm": 1.5329346656799316, + "learning_rate": 1.869203329369798e-06, + "loss": 0.3651, + "step": 3425 + }, + { + "epoch": 8.137972048765983, + "grad_norm": 1.5373605489730835, + "learning_rate": 1.8668252080856126e-06, + "loss": 0.3197, + "step": 3426 + }, + { + "epoch": 8.140350877192983, + "grad_norm": 1.731713891029358, + "learning_rate": 1.864447086801427e-06, + "loss": 0.3442, + "step": 3427 + }, + { + "epoch": 8.142729705619983, + "grad_norm": 1.6004726886749268, + "learning_rate": 1.8620689655172416e-06, + "loss": 0.3086, + "step": 3428 + }, + { + "epoch": 8.145108534046981, + "grad_norm": 1.5897979736328125, + "learning_rate": 1.859690844233056e-06, + "loss": 0.369, + "step": 3429 + }, + { + "epoch": 8.147487362473981, + "grad_norm": 1.362380027770996, + "learning_rate": 1.8573127229488704e-06, + "loss": 0.3144, + "step": 3430 + }, + { + "epoch": 8.149866190900982, + "grad_norm": 1.605521321296692, + "learning_rate": 1.8549346016646848e-06, + "loss": 0.367, + "step": 3431 + }, + { + "epoch": 8.152245019327982, + "grad_norm": 1.8663443326950073, + "learning_rate": 1.8525564803804996e-06, + "loss": 0.3869, + "step": 3432 + }, + { + "epoch": 8.15462384775498, + "grad_norm": 1.5225039720535278, + "learning_rate": 1.850178359096314e-06, + "loss": 0.3155, + "step": 3433 + }, + { + "epoch": 8.15700267618198, + "grad_norm": 1.433372974395752, + "learning_rate": 1.8478002378121284e-06, + "loss": 0.305, + "step": 3434 + }, + { + "epoch": 8.15938150460898, + "grad_norm": 1.6972202062606812, + "learning_rate": 1.8454221165279433e-06, + "loss": 0.3494, + "step": 3435 + }, + { + "epoch": 8.16176033303598, + "grad_norm": 1.6394321918487549, + "learning_rate": 1.8430439952437577e-06, + "loss": 0.3468, + "step": 3436 + }, + { + "epoch": 8.16413916146298, + "grad_norm": 1.466179609298706, + "learning_rate": 1.840665873959572e-06, + "loss": 0.3105, + "step": 3437 + }, + { + "epoch": 8.166517989889979, + "grad_norm": 1.3752061128616333, + "learning_rate": 1.8382877526753867e-06, + "loss": 0.3631, + "step": 3438 + }, + { + "epoch": 8.168896818316979, + "grad_norm": 1.318755030632019, + "learning_rate": 1.835909631391201e-06, + "loss": 0.2712, + "step": 3439 + }, + { + "epoch": 8.171275646743979, + "grad_norm": 1.402716875076294, + "learning_rate": 1.8335315101070155e-06, + "loss": 0.2899, + "step": 3440 + }, + { + "epoch": 8.173654475170979, + "grad_norm": 1.5484344959259033, + "learning_rate": 1.83115338882283e-06, + "loss": 0.2711, + "step": 3441 + }, + { + "epoch": 8.176033303597977, + "grad_norm": 1.5875523090362549, + "learning_rate": 1.8287752675386447e-06, + "loss": 0.3581, + "step": 3442 + }, + { + "epoch": 8.178412132024977, + "grad_norm": 1.3532395362854004, + "learning_rate": 1.826397146254459e-06, + "loss": 0.2952, + "step": 3443 + }, + { + "epoch": 8.180790960451978, + "grad_norm": 1.5124809741973877, + "learning_rate": 1.8240190249702735e-06, + "loss": 0.3192, + "step": 3444 + }, + { + "epoch": 8.183169788878978, + "grad_norm": 1.4305963516235352, + "learning_rate": 1.8216409036860883e-06, + "loss": 0.2509, + "step": 3445 + }, + { + "epoch": 8.185548617305976, + "grad_norm": 1.4705866575241089, + "learning_rate": 1.8192627824019027e-06, + "loss": 0.3746, + "step": 3446 + }, + { + "epoch": 8.187927445732976, + "grad_norm": 1.5488736629486084, + "learning_rate": 1.8168846611177171e-06, + "loss": 0.2802, + "step": 3447 + }, + { + "epoch": 8.190306274159976, + "grad_norm": 1.5453338623046875, + "learning_rate": 1.8145065398335315e-06, + "loss": 0.3344, + "step": 3448 + }, + { + "epoch": 8.192685102586976, + "grad_norm": 1.5530738830566406, + "learning_rate": 1.8121284185493463e-06, + "loss": 0.3724, + "step": 3449 + }, + { + "epoch": 8.195063931013976, + "grad_norm": 1.6495709419250488, + "learning_rate": 1.8097502972651607e-06, + "loss": 0.3503, + "step": 3450 + }, + { + "epoch": 8.195063931013976, + "eval_loss": 0.42648646235466003, + "eval_runtime": 22.7215, + "eval_samples_per_second": 32.92, + "eval_steps_per_second": 16.46, + "step": 3450 + }, + { + "epoch": 8.197442759440975, + "grad_norm": 1.4008891582489014, + "learning_rate": 1.8073721759809751e-06, + "loss": 0.3178, + "step": 3451 + }, + { + "epoch": 8.199821587867975, + "grad_norm": 1.4203789234161377, + "learning_rate": 1.8049940546967897e-06, + "loss": 0.3338, + "step": 3452 + }, + { + "epoch": 8.202200416294975, + "grad_norm": 1.47443425655365, + "learning_rate": 1.8026159334126041e-06, + "loss": 0.2979, + "step": 3453 + }, + { + "epoch": 8.204579244721975, + "grad_norm": 1.4105890989303589, + "learning_rate": 1.8002378121284185e-06, + "loss": 0.2646, + "step": 3454 + }, + { + "epoch": 8.206958073148973, + "grad_norm": 1.6168416738510132, + "learning_rate": 1.7978596908442334e-06, + "loss": 0.3325, + "step": 3455 + }, + { + "epoch": 8.209336901575973, + "grad_norm": 1.8929089307785034, + "learning_rate": 1.7954815695600478e-06, + "loss": 0.406, + "step": 3456 + }, + { + "epoch": 8.211715730002974, + "grad_norm": 1.413111686706543, + "learning_rate": 1.7931034482758622e-06, + "loss": 0.3101, + "step": 3457 + }, + { + "epoch": 8.214094558429974, + "grad_norm": 1.4979767799377441, + "learning_rate": 1.7907253269916766e-06, + "loss": 0.3453, + "step": 3458 + }, + { + "epoch": 8.216473386856974, + "grad_norm": 1.4909569025039673, + "learning_rate": 1.7883472057074914e-06, + "loss": 0.2974, + "step": 3459 + }, + { + "epoch": 8.218852215283972, + "grad_norm": 1.6970487833023071, + "learning_rate": 1.7859690844233058e-06, + "loss": 0.3122, + "step": 3460 + }, + { + "epoch": 8.221231043710972, + "grad_norm": 1.6165131330490112, + "learning_rate": 1.7835909631391202e-06, + "loss": 0.3226, + "step": 3461 + }, + { + "epoch": 8.223609872137972, + "grad_norm": 1.6634888648986816, + "learning_rate": 1.7812128418549348e-06, + "loss": 0.3311, + "step": 3462 + }, + { + "epoch": 8.225988700564972, + "grad_norm": 1.5790249109268188, + "learning_rate": 1.7788347205707492e-06, + "loss": 0.3211, + "step": 3463 + }, + { + "epoch": 8.22836752899197, + "grad_norm": 1.4003212451934814, + "learning_rate": 1.7764565992865638e-06, + "loss": 0.3669, + "step": 3464 + }, + { + "epoch": 8.23074635741897, + "grad_norm": 1.5556995868682861, + "learning_rate": 1.7740784780023782e-06, + "loss": 0.3428, + "step": 3465 + }, + { + "epoch": 8.23312518584597, + "grad_norm": 1.4662708044052124, + "learning_rate": 1.7717003567181928e-06, + "loss": 0.3289, + "step": 3466 + }, + { + "epoch": 8.235504014272971, + "grad_norm": 1.6103758811950684, + "learning_rate": 1.7693222354340072e-06, + "loss": 0.2618, + "step": 3467 + }, + { + "epoch": 8.237882842699971, + "grad_norm": 1.5142284631729126, + "learning_rate": 1.7669441141498216e-06, + "loss": 0.3578, + "step": 3468 + }, + { + "epoch": 8.24026167112697, + "grad_norm": 1.5182214975357056, + "learning_rate": 1.7645659928656364e-06, + "loss": 0.3191, + "step": 3469 + }, + { + "epoch": 8.24264049955397, + "grad_norm": 1.3994064331054688, + "learning_rate": 1.7621878715814508e-06, + "loss": 0.2871, + "step": 3470 + }, + { + "epoch": 8.24501932798097, + "grad_norm": 1.5528161525726318, + "learning_rate": 1.7598097502972652e-06, + "loss": 0.2997, + "step": 3471 + }, + { + "epoch": 8.24739815640797, + "grad_norm": 1.4415440559387207, + "learning_rate": 1.7574316290130796e-06, + "loss": 0.2935, + "step": 3472 + }, + { + "epoch": 8.249776984834968, + "grad_norm": 1.689462661743164, + "learning_rate": 1.7550535077288945e-06, + "loss": 0.4171, + "step": 3473 + }, + { + "epoch": 8.252155813261968, + "grad_norm": 1.7793418169021606, + "learning_rate": 1.7526753864447089e-06, + "loss": 0.3212, + "step": 3474 + }, + { + "epoch": 8.254534641688968, + "grad_norm": 1.4607651233673096, + "learning_rate": 1.7502972651605233e-06, + "loss": 0.286, + "step": 3475 + }, + { + "epoch": 8.256913470115968, + "grad_norm": 1.5351417064666748, + "learning_rate": 1.7479191438763379e-06, + "loss": 0.2764, + "step": 3476 + }, + { + "epoch": 8.259292298542968, + "grad_norm": 1.4049952030181885, + "learning_rate": 1.7455410225921523e-06, + "loss": 0.2916, + "step": 3477 + }, + { + "epoch": 8.261671126969967, + "grad_norm": 1.7382100820541382, + "learning_rate": 1.7431629013079669e-06, + "loss": 0.3634, + "step": 3478 + }, + { + "epoch": 8.264049955396967, + "grad_norm": 1.5672014951705933, + "learning_rate": 1.7407847800237815e-06, + "loss": 0.3598, + "step": 3479 + }, + { + "epoch": 8.266428783823967, + "grad_norm": 1.6722278594970703, + "learning_rate": 1.7384066587395959e-06, + "loss": 0.4073, + "step": 3480 + }, + { + "epoch": 8.268807612250967, + "grad_norm": 1.6069480180740356, + "learning_rate": 1.7360285374554103e-06, + "loss": 0.3255, + "step": 3481 + }, + { + "epoch": 8.271186440677965, + "grad_norm": 1.6703355312347412, + "learning_rate": 1.7336504161712247e-06, + "loss": 0.3792, + "step": 3482 + }, + { + "epoch": 8.273565269104965, + "grad_norm": 1.7409298419952393, + "learning_rate": 1.7312722948870395e-06, + "loss": 0.3403, + "step": 3483 + }, + { + "epoch": 8.275944097531966, + "grad_norm": 1.8443559408187866, + "learning_rate": 1.728894173602854e-06, + "loss": 0.4112, + "step": 3484 + }, + { + "epoch": 8.278322925958966, + "grad_norm": 1.4114360809326172, + "learning_rate": 1.7265160523186683e-06, + "loss": 0.2929, + "step": 3485 + }, + { + "epoch": 8.280701754385966, + "grad_norm": 1.5952104330062866, + "learning_rate": 1.724137931034483e-06, + "loss": 0.3105, + "step": 3486 + }, + { + "epoch": 8.283080582812964, + "grad_norm": 1.4474912881851196, + "learning_rate": 1.7217598097502975e-06, + "loss": 0.312, + "step": 3487 + }, + { + "epoch": 8.285459411239964, + "grad_norm": 1.9034007787704468, + "learning_rate": 1.719381688466112e-06, + "loss": 0.3607, + "step": 3488 + }, + { + "epoch": 8.287838239666964, + "grad_norm": 1.5156935453414917, + "learning_rate": 1.7170035671819263e-06, + "loss": 0.2904, + "step": 3489 + }, + { + "epoch": 8.290217068093964, + "grad_norm": 1.663345217704773, + "learning_rate": 1.714625445897741e-06, + "loss": 0.3649, + "step": 3490 + }, + { + "epoch": 8.292595896520963, + "grad_norm": 1.1874653100967407, + "learning_rate": 1.7122473246135553e-06, + "loss": 0.2776, + "step": 3491 + }, + { + "epoch": 8.294974724947963, + "grad_norm": 1.8515570163726807, + "learning_rate": 1.70986920332937e-06, + "loss": 0.3825, + "step": 3492 + }, + { + "epoch": 8.297353553374963, + "grad_norm": 1.6239577531814575, + "learning_rate": 1.7074910820451846e-06, + "loss": 0.3231, + "step": 3493 + }, + { + "epoch": 8.299732381801963, + "grad_norm": 1.6760371923446655, + "learning_rate": 1.705112960760999e-06, + "loss": 0.3708, + "step": 3494 + }, + { + "epoch": 8.302111210228961, + "grad_norm": 1.6858165264129639, + "learning_rate": 1.7027348394768134e-06, + "loss": 0.3528, + "step": 3495 + }, + { + "epoch": 8.304490038655961, + "grad_norm": 1.525329828262329, + "learning_rate": 1.7003567181926282e-06, + "loss": 0.32, + "step": 3496 + }, + { + "epoch": 8.306868867082962, + "grad_norm": 1.5275418758392334, + "learning_rate": 1.6979785969084426e-06, + "loss": 0.337, + "step": 3497 + }, + { + "epoch": 8.309247695509962, + "grad_norm": 1.5244156122207642, + "learning_rate": 1.695600475624257e-06, + "loss": 0.3088, + "step": 3498 + }, + { + "epoch": 8.311626523936962, + "grad_norm": 1.5366432666778564, + "learning_rate": 1.6932223543400714e-06, + "loss": 0.3404, + "step": 3499 + }, + { + "epoch": 8.31400535236396, + "grad_norm": 1.3460941314697266, + "learning_rate": 1.690844233055886e-06, + "loss": 0.3328, + "step": 3500 + }, + { + "epoch": 8.31400535236396, + "eval_loss": 0.42587482929229736, + "eval_runtime": 22.5517, + "eval_samples_per_second": 33.168, + "eval_steps_per_second": 16.584, + "step": 3500 + }, + { + "epoch": 8.31638418079096, + "grad_norm": 1.435987949371338, + "learning_rate": 1.6884661117717006e-06, + "loss": 0.3055, + "step": 3501 + }, + { + "epoch": 8.31876300921796, + "grad_norm": 1.7395353317260742, + "learning_rate": 1.686087990487515e-06, + "loss": 0.3364, + "step": 3502 + }, + { + "epoch": 8.32114183764496, + "grad_norm": 1.450340986251831, + "learning_rate": 1.6837098692033296e-06, + "loss": 0.3695, + "step": 3503 + }, + { + "epoch": 8.323520666071959, + "grad_norm": 1.3327326774597168, + "learning_rate": 1.681331747919144e-06, + "loss": 0.3013, + "step": 3504 + }, + { + "epoch": 8.325899494498959, + "grad_norm": 1.5957725048065186, + "learning_rate": 1.6789536266349584e-06, + "loss": 0.3344, + "step": 3505 + }, + { + "epoch": 8.328278322925959, + "grad_norm": 1.6245731115341187, + "learning_rate": 1.6765755053507728e-06, + "loss": 0.2974, + "step": 3506 + }, + { + "epoch": 8.330657151352959, + "grad_norm": 1.3245279788970947, + "learning_rate": 1.6741973840665876e-06, + "loss": 0.2688, + "step": 3507 + }, + { + "epoch": 8.33303597977996, + "grad_norm": 1.702358365058899, + "learning_rate": 1.671819262782402e-06, + "loss": 0.3347, + "step": 3508 + }, + { + "epoch": 8.335414808206957, + "grad_norm": 1.4761947393417358, + "learning_rate": 1.6694411414982164e-06, + "loss": 0.2228, + "step": 3509 + }, + { + "epoch": 8.337793636633958, + "grad_norm": 1.629269003868103, + "learning_rate": 1.6670630202140312e-06, + "loss": 0.3613, + "step": 3510 + }, + { + "epoch": 8.340172465060958, + "grad_norm": 1.6188087463378906, + "learning_rate": 1.6646848989298456e-06, + "loss": 0.281, + "step": 3511 + }, + { + "epoch": 8.342551293487958, + "grad_norm": 1.2869369983673096, + "learning_rate": 1.66230677764566e-06, + "loss": 0.2498, + "step": 3512 + }, + { + "epoch": 8.344930121914956, + "grad_norm": 1.7841954231262207, + "learning_rate": 1.6599286563614744e-06, + "loss": 0.4232, + "step": 3513 + }, + { + "epoch": 8.347308950341956, + "grad_norm": 1.6791640520095825, + "learning_rate": 1.657550535077289e-06, + "loss": 0.37, + "step": 3514 + }, + { + "epoch": 8.349687778768956, + "grad_norm": 1.4123343229293823, + "learning_rate": 1.6551724137931037e-06, + "loss": 0.3138, + "step": 3515 + }, + { + "epoch": 8.352066607195956, + "grad_norm": 1.5053707361221313, + "learning_rate": 1.652794292508918e-06, + "loss": 0.3642, + "step": 3516 + }, + { + "epoch": 8.354445435622956, + "grad_norm": 1.551558256149292, + "learning_rate": 1.6504161712247327e-06, + "loss": 0.3372, + "step": 3517 + }, + { + "epoch": 8.356824264049955, + "grad_norm": 1.352620005607605, + "learning_rate": 1.648038049940547e-06, + "loss": 0.3212, + "step": 3518 + }, + { + "epoch": 8.359203092476955, + "grad_norm": 1.5525131225585938, + "learning_rate": 1.6456599286563615e-06, + "loss": 0.3059, + "step": 3519 + }, + { + "epoch": 8.361581920903955, + "grad_norm": 1.2252603769302368, + "learning_rate": 1.6432818073721763e-06, + "loss": 0.2575, + "step": 3520 + }, + { + "epoch": 8.363960749330955, + "grad_norm": 1.4824069738388062, + "learning_rate": 1.6409036860879907e-06, + "loss": 0.3102, + "step": 3521 + }, + { + "epoch": 8.366339577757953, + "grad_norm": 1.392327070236206, + "learning_rate": 1.638525564803805e-06, + "loss": 0.2919, + "step": 3522 + }, + { + "epoch": 8.368718406184954, + "grad_norm": 1.5662305355072021, + "learning_rate": 1.6361474435196195e-06, + "loss": 0.3558, + "step": 3523 + }, + { + "epoch": 8.371097234611954, + "grad_norm": 1.7080914974212646, + "learning_rate": 1.6337693222354343e-06, + "loss": 0.3941, + "step": 3524 + }, + { + "epoch": 8.373476063038954, + "grad_norm": 1.3758834600448608, + "learning_rate": 1.6313912009512487e-06, + "loss": 0.2792, + "step": 3525 + }, + { + "epoch": 8.375854891465954, + "grad_norm": 1.6636333465576172, + "learning_rate": 1.6290130796670631e-06, + "loss": 0.3252, + "step": 3526 + }, + { + "epoch": 8.378233719892952, + "grad_norm": 1.7890195846557617, + "learning_rate": 1.6266349583828777e-06, + "loss": 0.3119, + "step": 3527 + }, + { + "epoch": 8.380612548319952, + "grad_norm": 1.590279221534729, + "learning_rate": 1.6242568370986921e-06, + "loss": 0.3032, + "step": 3528 + }, + { + "epoch": 8.382991376746952, + "grad_norm": 1.7103745937347412, + "learning_rate": 1.6218787158145065e-06, + "loss": 0.3433, + "step": 3529 + }, + { + "epoch": 8.385370205173952, + "grad_norm": 1.4384952783584595, + "learning_rate": 1.6195005945303211e-06, + "loss": 0.3198, + "step": 3530 + }, + { + "epoch": 8.38774903360095, + "grad_norm": 1.5636422634124756, + "learning_rate": 1.6171224732461357e-06, + "loss": 0.2822, + "step": 3531 + }, + { + "epoch": 8.390127862027951, + "grad_norm": 1.6979217529296875, + "learning_rate": 1.6147443519619501e-06, + "loss": 0.3539, + "step": 3532 + }, + { + "epoch": 8.392506690454951, + "grad_norm": 1.6413521766662598, + "learning_rate": 1.6123662306777645e-06, + "loss": 0.343, + "step": 3533 + }, + { + "epoch": 8.394885518881951, + "grad_norm": 1.506707787513733, + "learning_rate": 1.6099881093935794e-06, + "loss": 0.396, + "step": 3534 + }, + { + "epoch": 8.397264347308951, + "grad_norm": 1.8635413646697998, + "learning_rate": 1.6076099881093938e-06, + "loss": 0.3237, + "step": 3535 + }, + { + "epoch": 8.39964317573595, + "grad_norm": 1.723126769065857, + "learning_rate": 1.6052318668252082e-06, + "loss": 0.2962, + "step": 3536 + }, + { + "epoch": 8.40202200416295, + "grad_norm": 1.3965997695922852, + "learning_rate": 1.6028537455410228e-06, + "loss": 0.3053, + "step": 3537 + }, + { + "epoch": 8.40440083258995, + "grad_norm": 1.474820852279663, + "learning_rate": 1.6004756242568372e-06, + "loss": 0.3678, + "step": 3538 + }, + { + "epoch": 8.40677966101695, + "grad_norm": 1.7649742364883423, + "learning_rate": 1.5980975029726518e-06, + "loss": 0.3325, + "step": 3539 + }, + { + "epoch": 8.409158489443948, + "grad_norm": 1.5315227508544922, + "learning_rate": 1.5957193816884662e-06, + "loss": 0.2574, + "step": 3540 + }, + { + "epoch": 8.411537317870948, + "grad_norm": 1.5712000131607056, + "learning_rate": 1.5933412604042808e-06, + "loss": 0.2859, + "step": 3541 + }, + { + "epoch": 8.413916146297948, + "grad_norm": 1.5906555652618408, + "learning_rate": 1.5909631391200952e-06, + "loss": 0.3392, + "step": 3542 + }, + { + "epoch": 8.416294974724948, + "grad_norm": 1.5384715795516968, + "learning_rate": 1.5885850178359096e-06, + "loss": 0.3173, + "step": 3543 + }, + { + "epoch": 8.418673803151947, + "grad_norm": 1.5416871309280396, + "learning_rate": 1.5862068965517244e-06, + "loss": 0.3835, + "step": 3544 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 1.5811231136322021, + "learning_rate": 1.5838287752675388e-06, + "loss": 0.3461, + "step": 3545 + }, + { + "epoch": 8.423431460005947, + "grad_norm": 1.874444842338562, + "learning_rate": 1.5814506539833532e-06, + "loss": 0.3979, + "step": 3546 + }, + { + "epoch": 8.425810288432947, + "grad_norm": 1.4846525192260742, + "learning_rate": 1.5790725326991676e-06, + "loss": 0.3431, + "step": 3547 + }, + { + "epoch": 8.428189116859947, + "grad_norm": 1.5621412992477417, + "learning_rate": 1.5766944114149824e-06, + "loss": 0.2726, + "step": 3548 + }, + { + "epoch": 8.430567945286946, + "grad_norm": 1.5611345767974854, + "learning_rate": 1.5743162901307968e-06, + "loss": 0.3319, + "step": 3549 + }, + { + "epoch": 8.432946773713946, + "grad_norm": 1.620380163192749, + "learning_rate": 1.5719381688466112e-06, + "loss": 0.3717, + "step": 3550 + }, + { + "epoch": 8.432946773713946, + "eval_loss": 0.4265909492969513, + "eval_runtime": 22.498, + "eval_samples_per_second": 33.247, + "eval_steps_per_second": 16.624, + "step": 3550 + }, + { + "epoch": 8.435325602140946, + "grad_norm": 1.6972086429595947, + "learning_rate": 1.5695600475624258e-06, + "loss": 0.3382, + "step": 3551 + }, + { + "epoch": 8.437704430567946, + "grad_norm": 1.595907211303711, + "learning_rate": 1.5671819262782402e-06, + "loss": 0.3854, + "step": 3552 + }, + { + "epoch": 8.440083258994944, + "grad_norm": 1.6862574815750122, + "learning_rate": 1.5648038049940549e-06, + "loss": 0.322, + "step": 3553 + }, + { + "epoch": 8.442462087421944, + "grad_norm": 1.4338687658309937, + "learning_rate": 1.5624256837098695e-06, + "loss": 0.357, + "step": 3554 + }, + { + "epoch": 8.444840915848944, + "grad_norm": 1.3652828931808472, + "learning_rate": 1.5600475624256839e-06, + "loss": 0.2903, + "step": 3555 + }, + { + "epoch": 8.447219744275944, + "grad_norm": 1.5778048038482666, + "learning_rate": 1.5576694411414983e-06, + "loss": 0.2843, + "step": 3556 + }, + { + "epoch": 8.449598572702945, + "grad_norm": 1.7500706911087036, + "learning_rate": 1.5552913198573127e-06, + "loss": 0.4032, + "step": 3557 + }, + { + "epoch": 8.451977401129943, + "grad_norm": 1.4048631191253662, + "learning_rate": 1.5529131985731275e-06, + "loss": 0.3025, + "step": 3558 + }, + { + "epoch": 8.454356229556943, + "grad_norm": 1.336581826210022, + "learning_rate": 1.5505350772889419e-06, + "loss": 0.2761, + "step": 3559 + }, + { + "epoch": 8.456735057983943, + "grad_norm": 1.394282341003418, + "learning_rate": 1.5481569560047563e-06, + "loss": 0.3481, + "step": 3560 + }, + { + "epoch": 8.459113886410943, + "grad_norm": 1.5887014865875244, + "learning_rate": 1.545778834720571e-06, + "loss": 0.3143, + "step": 3561 + }, + { + "epoch": 8.461492714837942, + "grad_norm": 1.562636137008667, + "learning_rate": 1.5434007134363855e-06, + "loss": 0.3064, + "step": 3562 + }, + { + "epoch": 8.463871543264942, + "grad_norm": 1.5274467468261719, + "learning_rate": 1.5410225921522e-06, + "loss": 0.3337, + "step": 3563 + }, + { + "epoch": 8.466250371691942, + "grad_norm": 1.6055867671966553, + "learning_rate": 1.5386444708680143e-06, + "loss": 0.2863, + "step": 3564 + }, + { + "epoch": 8.468629200118942, + "grad_norm": 1.4416366815567017, + "learning_rate": 1.536266349583829e-06, + "loss": 0.3073, + "step": 3565 + }, + { + "epoch": 8.471008028545942, + "grad_norm": 1.351763129234314, + "learning_rate": 1.5338882282996433e-06, + "loss": 0.3126, + "step": 3566 + }, + { + "epoch": 8.47338685697294, + "grad_norm": 1.5317310094833374, + "learning_rate": 1.531510107015458e-06, + "loss": 0.3232, + "step": 3567 + }, + { + "epoch": 8.47576568539994, + "grad_norm": 1.326821208000183, + "learning_rate": 1.5291319857312725e-06, + "loss": 0.291, + "step": 3568 + }, + { + "epoch": 8.47814451382694, + "grad_norm": 1.6080410480499268, + "learning_rate": 1.526753864447087e-06, + "loss": 0.3348, + "step": 3569 + }, + { + "epoch": 8.48052334225394, + "grad_norm": 1.4980838298797607, + "learning_rate": 1.5243757431629013e-06, + "loss": 0.2728, + "step": 3570 + }, + { + "epoch": 8.482902170680939, + "grad_norm": 1.5367146730422974, + "learning_rate": 1.5219976218787157e-06, + "loss": 0.2659, + "step": 3571 + }, + { + "epoch": 8.485280999107939, + "grad_norm": 1.361850619316101, + "learning_rate": 1.5196195005945306e-06, + "loss": 0.2899, + "step": 3572 + }, + { + "epoch": 8.487659827534939, + "grad_norm": 1.6629457473754883, + "learning_rate": 1.517241379310345e-06, + "loss": 0.3213, + "step": 3573 + }, + { + "epoch": 8.49003865596194, + "grad_norm": 1.5463085174560547, + "learning_rate": 1.5148632580261594e-06, + "loss": 0.2969, + "step": 3574 + }, + { + "epoch": 8.49241748438894, + "grad_norm": 1.3932453393936157, + "learning_rate": 1.512485136741974e-06, + "loss": 0.3325, + "step": 3575 + }, + { + "epoch": 8.494796312815938, + "grad_norm": 1.3949167728424072, + "learning_rate": 1.5101070154577886e-06, + "loss": 0.2841, + "step": 3576 + }, + { + "epoch": 8.497175141242938, + "grad_norm": 1.6073777675628662, + "learning_rate": 1.507728894173603e-06, + "loss": 0.323, + "step": 3577 + }, + { + "epoch": 8.499553969669938, + "grad_norm": 1.5035643577575684, + "learning_rate": 1.5053507728894176e-06, + "loss": 0.3245, + "step": 3578 + }, + { + "epoch": 8.501932798096938, + "grad_norm": 1.6996068954467773, + "learning_rate": 1.502972651605232e-06, + "loss": 0.3366, + "step": 3579 + }, + { + "epoch": 8.504311626523936, + "grad_norm": 1.7336313724517822, + "learning_rate": 1.5005945303210464e-06, + "loss": 0.3763, + "step": 3580 + }, + { + "epoch": 8.506690454950936, + "grad_norm": 1.7124969959259033, + "learning_rate": 1.4982164090368608e-06, + "loss": 0.3396, + "step": 3581 + }, + { + "epoch": 8.509069283377936, + "grad_norm": 1.4800952672958374, + "learning_rate": 1.4958382877526756e-06, + "loss": 0.2808, + "step": 3582 + }, + { + "epoch": 8.511448111804937, + "grad_norm": 1.6486272811889648, + "learning_rate": 1.49346016646849e-06, + "loss": 0.3234, + "step": 3583 + }, + { + "epoch": 8.513826940231937, + "grad_norm": 1.6242011785507202, + "learning_rate": 1.4910820451843044e-06, + "loss": 0.3302, + "step": 3584 + }, + { + "epoch": 8.516205768658935, + "grad_norm": 1.567671775817871, + "learning_rate": 1.4887039239001192e-06, + "loss": 0.2724, + "step": 3585 + }, + { + "epoch": 8.518584597085935, + "grad_norm": 1.432282567024231, + "learning_rate": 1.4863258026159336e-06, + "loss": 0.3044, + "step": 3586 + }, + { + "epoch": 8.520963425512935, + "grad_norm": 1.6078349351882935, + "learning_rate": 1.483947681331748e-06, + "loss": 0.2984, + "step": 3587 + }, + { + "epoch": 8.523342253939935, + "grad_norm": 1.4639089107513428, + "learning_rate": 1.4815695600475624e-06, + "loss": 0.3176, + "step": 3588 + }, + { + "epoch": 8.525721082366934, + "grad_norm": 1.8479948043823242, + "learning_rate": 1.479191438763377e-06, + "loss": 0.336, + "step": 3589 + }, + { + "epoch": 8.528099910793934, + "grad_norm": 1.5971802473068237, + "learning_rate": 1.4768133174791916e-06, + "loss": 0.34, + "step": 3590 + }, + { + "epoch": 8.530478739220934, + "grad_norm": 1.6358247995376587, + "learning_rate": 1.474435196195006e-06, + "loss": 0.3803, + "step": 3591 + }, + { + "epoch": 8.532857567647934, + "grad_norm": 1.451765537261963, + "learning_rate": 1.4720570749108207e-06, + "loss": 0.3588, + "step": 3592 + }, + { + "epoch": 8.535236396074932, + "grad_norm": 1.339247226715088, + "learning_rate": 1.469678953626635e-06, + "loss": 0.2568, + "step": 3593 + }, + { + "epoch": 8.537615224501932, + "grad_norm": 1.815232276916504, + "learning_rate": 1.4673008323424495e-06, + "loss": 0.3478, + "step": 3594 + }, + { + "epoch": 8.539994052928932, + "grad_norm": 1.7567012310028076, + "learning_rate": 1.4649227110582643e-06, + "loss": 0.3332, + "step": 3595 + }, + { + "epoch": 8.542372881355933, + "grad_norm": 1.512372374534607, + "learning_rate": 1.4625445897740787e-06, + "loss": 0.3305, + "step": 3596 + }, + { + "epoch": 8.544751709782933, + "grad_norm": 1.6596437692642212, + "learning_rate": 1.460166468489893e-06, + "loss": 0.3451, + "step": 3597 + }, + { + "epoch": 8.547130538209931, + "grad_norm": 1.678159236907959, + "learning_rate": 1.4577883472057075e-06, + "loss": 0.3273, + "step": 3598 + }, + { + "epoch": 8.549509366636931, + "grad_norm": 1.6385680437088013, + "learning_rate": 1.4554102259215223e-06, + "loss": 0.3295, + "step": 3599 + }, + { + "epoch": 8.551888195063931, + "grad_norm": 1.434683084487915, + "learning_rate": 1.4530321046373367e-06, + "loss": 0.3653, + "step": 3600 + }, + { + "epoch": 8.551888195063931, + "eval_loss": 0.4272831380367279, + "eval_runtime": 22.4292, + "eval_samples_per_second": 33.349, + "eval_steps_per_second": 16.675, + "step": 3600 + }, + { + "epoch": 8.554267023490931, + "grad_norm": 1.5157699584960938, + "learning_rate": 1.450653983353151e-06, + "loss": 0.3348, + "step": 3601 + }, + { + "epoch": 8.55664585191793, + "grad_norm": 1.6775445938110352, + "learning_rate": 1.4482758620689657e-06, + "loss": 0.3558, + "step": 3602 + }, + { + "epoch": 8.55902468034493, + "grad_norm": 1.3175654411315918, + "learning_rate": 1.4458977407847801e-06, + "loss": 0.3211, + "step": 3603 + }, + { + "epoch": 8.56140350877193, + "grad_norm": 1.4896634817123413, + "learning_rate": 1.4435196195005945e-06, + "loss": 0.2844, + "step": 3604 + }, + { + "epoch": 8.56378233719893, + "grad_norm": 1.5746333599090576, + "learning_rate": 1.4411414982164091e-06, + "loss": 0.3166, + "step": 3605 + }, + { + "epoch": 8.56616116562593, + "grad_norm": 1.733944296836853, + "learning_rate": 1.4387633769322237e-06, + "loss": 0.3019, + "step": 3606 + }, + { + "epoch": 8.568539994052928, + "grad_norm": 1.2664188146591187, + "learning_rate": 1.4363852556480381e-06, + "loss": 0.2861, + "step": 3607 + }, + { + "epoch": 8.570918822479928, + "grad_norm": 1.9342091083526611, + "learning_rate": 1.4340071343638525e-06, + "loss": 0.4117, + "step": 3608 + }, + { + "epoch": 8.573297650906929, + "grad_norm": 1.391642451286316, + "learning_rate": 1.4316290130796673e-06, + "loss": 0.2931, + "step": 3609 + }, + { + "epoch": 8.575676479333929, + "grad_norm": 1.3977673053741455, + "learning_rate": 1.4292508917954817e-06, + "loss": 0.2981, + "step": 3610 + }, + { + "epoch": 8.578055307760927, + "grad_norm": 1.4301986694335938, + "learning_rate": 1.4268727705112961e-06, + "loss": 0.2736, + "step": 3611 + }, + { + "epoch": 8.580434136187927, + "grad_norm": 1.5012733936309814, + "learning_rate": 1.4244946492271108e-06, + "loss": 0.3829, + "step": 3612 + }, + { + "epoch": 8.582812964614927, + "grad_norm": 1.647625207901001, + "learning_rate": 1.4221165279429252e-06, + "loss": 0.3743, + "step": 3613 + }, + { + "epoch": 8.585191793041927, + "grad_norm": 1.534834384918213, + "learning_rate": 1.4197384066587398e-06, + "loss": 0.2951, + "step": 3614 + }, + { + "epoch": 8.587570621468927, + "grad_norm": 1.569266676902771, + "learning_rate": 1.4173602853745542e-06, + "loss": 0.3693, + "step": 3615 + }, + { + "epoch": 8.589949449895926, + "grad_norm": 1.6507121324539185, + "learning_rate": 1.4149821640903688e-06, + "loss": 0.3116, + "step": 3616 + }, + { + "epoch": 8.592328278322926, + "grad_norm": 1.4171403646469116, + "learning_rate": 1.4126040428061832e-06, + "loss": 0.283, + "step": 3617 + }, + { + "epoch": 8.594707106749926, + "grad_norm": 1.6506798267364502, + "learning_rate": 1.4102259215219976e-06, + "loss": 0.2798, + "step": 3618 + }, + { + "epoch": 8.597085935176926, + "grad_norm": 1.5668721199035645, + "learning_rate": 1.4078478002378124e-06, + "loss": 0.3848, + "step": 3619 + }, + { + "epoch": 8.599464763603924, + "grad_norm": 1.4375686645507812, + "learning_rate": 1.4054696789536268e-06, + "loss": 0.2905, + "step": 3620 + }, + { + "epoch": 8.601843592030924, + "grad_norm": 1.5572819709777832, + "learning_rate": 1.4030915576694412e-06, + "loss": 0.2907, + "step": 3621 + }, + { + "epoch": 8.604222420457925, + "grad_norm": 1.5237215757369995, + "learning_rate": 1.4007134363852556e-06, + "loss": 0.3202, + "step": 3622 + }, + { + "epoch": 8.606601248884925, + "grad_norm": 1.697910189628601, + "learning_rate": 1.3983353151010704e-06, + "loss": 0.3329, + "step": 3623 + }, + { + "epoch": 8.608980077311923, + "grad_norm": 1.5594604015350342, + "learning_rate": 1.3959571938168848e-06, + "loss": 0.3068, + "step": 3624 + }, + { + "epoch": 8.611358905738923, + "grad_norm": 1.6423453092575073, + "learning_rate": 1.3935790725326992e-06, + "loss": 0.369, + "step": 3625 + }, + { + "epoch": 8.613737734165923, + "grad_norm": 1.4400221109390259, + "learning_rate": 1.3912009512485138e-06, + "loss": 0.2687, + "step": 3626 + }, + { + "epoch": 8.616116562592923, + "grad_norm": 1.68343186378479, + "learning_rate": 1.3888228299643282e-06, + "loss": 0.3329, + "step": 3627 + }, + { + "epoch": 8.618495391019923, + "grad_norm": 1.4303144216537476, + "learning_rate": 1.3864447086801428e-06, + "loss": 0.3097, + "step": 3628 + }, + { + "epoch": 8.620874219446922, + "grad_norm": 1.7274521589279175, + "learning_rate": 1.3840665873959572e-06, + "loss": 0.3468, + "step": 3629 + }, + { + "epoch": 8.623253047873922, + "grad_norm": 1.4489414691925049, + "learning_rate": 1.3816884661117718e-06, + "loss": 0.3176, + "step": 3630 + }, + { + "epoch": 8.625631876300922, + "grad_norm": 1.5772809982299805, + "learning_rate": 1.3793103448275862e-06, + "loss": 0.2907, + "step": 3631 + }, + { + "epoch": 8.628010704727922, + "grad_norm": 1.5209261178970337, + "learning_rate": 1.3769322235434006e-06, + "loss": 0.3026, + "step": 3632 + }, + { + "epoch": 8.630389533154922, + "grad_norm": 1.8106515407562256, + "learning_rate": 1.3745541022592155e-06, + "loss": 0.2618, + "step": 3633 + }, + { + "epoch": 8.63276836158192, + "grad_norm": 1.4123209714889526, + "learning_rate": 1.3721759809750299e-06, + "loss": 0.3124, + "step": 3634 + }, + { + "epoch": 8.63514719000892, + "grad_norm": 1.4412957429885864, + "learning_rate": 1.3697978596908443e-06, + "loss": 0.3135, + "step": 3635 + }, + { + "epoch": 8.63752601843592, + "grad_norm": 1.4946672916412354, + "learning_rate": 1.3674197384066589e-06, + "loss": 0.3199, + "step": 3636 + }, + { + "epoch": 8.63990484686292, + "grad_norm": 1.538862943649292, + "learning_rate": 1.3650416171224735e-06, + "loss": 0.3095, + "step": 3637 + }, + { + "epoch": 8.642283675289919, + "grad_norm": 1.5616439580917358, + "learning_rate": 1.3626634958382879e-06, + "loss": 0.3777, + "step": 3638 + }, + { + "epoch": 8.64466250371692, + "grad_norm": 1.7152581214904785, + "learning_rate": 1.3602853745541023e-06, + "loss": 0.3743, + "step": 3639 + }, + { + "epoch": 8.64704133214392, + "grad_norm": 1.6123900413513184, + "learning_rate": 1.357907253269917e-06, + "loss": 0.2762, + "step": 3640 + }, + { + "epoch": 8.64942016057092, + "grad_norm": 1.6554999351501465, + "learning_rate": 1.3555291319857313e-06, + "loss": 0.347, + "step": 3641 + }, + { + "epoch": 8.651798988997918, + "grad_norm": 1.493951678276062, + "learning_rate": 1.353151010701546e-06, + "loss": 0.312, + "step": 3642 + }, + { + "epoch": 8.654177817424918, + "grad_norm": 1.4626973867416382, + "learning_rate": 1.3507728894173605e-06, + "loss": 0.319, + "step": 3643 + }, + { + "epoch": 8.656556645851918, + "grad_norm": 1.6232155561447144, + "learning_rate": 1.348394768133175e-06, + "loss": 0.2856, + "step": 3644 + }, + { + "epoch": 8.658935474278918, + "grad_norm": 1.4017575979232788, + "learning_rate": 1.3460166468489893e-06, + "loss": 0.3062, + "step": 3645 + }, + { + "epoch": 8.661314302705918, + "grad_norm": 1.6163498163223267, + "learning_rate": 1.3436385255648037e-06, + "loss": 0.3836, + "step": 3646 + }, + { + "epoch": 8.663693131132916, + "grad_norm": 1.523048996925354, + "learning_rate": 1.3412604042806185e-06, + "loss": 0.3253, + "step": 3647 + }, + { + "epoch": 8.666071959559916, + "grad_norm": 1.8122884035110474, + "learning_rate": 1.338882282996433e-06, + "loss": 0.3137, + "step": 3648 + }, + { + "epoch": 8.668450787986917, + "grad_norm": 1.6727081537246704, + "learning_rate": 1.3365041617122473e-06, + "loss": 0.3527, + "step": 3649 + }, + { + "epoch": 8.670829616413917, + "grad_norm": 1.5368080139160156, + "learning_rate": 1.334126040428062e-06, + "loss": 0.3027, + "step": 3650 + }, + { + "epoch": 8.670829616413917, + "eval_loss": 0.42632579803466797, + "eval_runtime": 23.1453, + "eval_samples_per_second": 32.318, + "eval_steps_per_second": 16.159, + "step": 3650 + }, + { + "epoch": 8.673208444840915, + "grad_norm": 1.6955238580703735, + "learning_rate": 1.3317479191438766e-06, + "loss": 0.3311, + "step": 3651 + }, + { + "epoch": 8.675587273267915, + "grad_norm": 1.8579959869384766, + "learning_rate": 1.329369797859691e-06, + "loss": 0.3686, + "step": 3652 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 1.5098873376846313, + "learning_rate": 1.3269916765755056e-06, + "loss": 0.3174, + "step": 3653 + }, + { + "epoch": 8.680344930121915, + "grad_norm": 1.3667107820510864, + "learning_rate": 1.32461355529132e-06, + "loss": 0.2848, + "step": 3654 + }, + { + "epoch": 8.682723758548915, + "grad_norm": 1.6322307586669922, + "learning_rate": 1.3222354340071344e-06, + "loss": 0.3422, + "step": 3655 + }, + { + "epoch": 8.685102586975914, + "grad_norm": 1.743981122970581, + "learning_rate": 1.3198573127229488e-06, + "loss": 0.3158, + "step": 3656 + }, + { + "epoch": 8.687481415402914, + "grad_norm": 1.4204984903335571, + "learning_rate": 1.3174791914387636e-06, + "loss": 0.2796, + "step": 3657 + }, + { + "epoch": 8.689860243829914, + "grad_norm": 1.9110546112060547, + "learning_rate": 1.315101070154578e-06, + "loss": 0.3817, + "step": 3658 + }, + { + "epoch": 8.692239072256914, + "grad_norm": 1.6266381740570068, + "learning_rate": 1.3127229488703924e-06, + "loss": 0.3044, + "step": 3659 + }, + { + "epoch": 8.694617900683912, + "grad_norm": 1.673478364944458, + "learning_rate": 1.3103448275862072e-06, + "loss": 0.369, + "step": 3660 + }, + { + "epoch": 8.696996729110912, + "grad_norm": 1.5132158994674683, + "learning_rate": 1.3079667063020216e-06, + "loss": 0.3374, + "step": 3661 + }, + { + "epoch": 8.699375557537913, + "grad_norm": 1.7466073036193848, + "learning_rate": 1.305588585017836e-06, + "loss": 0.3577, + "step": 3662 + }, + { + "epoch": 8.701754385964913, + "grad_norm": 1.6330883502960205, + "learning_rate": 1.3032104637336504e-06, + "loss": 0.3232, + "step": 3663 + }, + { + "epoch": 8.704133214391913, + "grad_norm": 1.5312587022781372, + "learning_rate": 1.300832342449465e-06, + "loss": 0.3231, + "step": 3664 + }, + { + "epoch": 8.706512042818911, + "grad_norm": 1.656788945198059, + "learning_rate": 1.2984542211652796e-06, + "loss": 0.3448, + "step": 3665 + }, + { + "epoch": 8.708890871245911, + "grad_norm": 1.428627371788025, + "learning_rate": 1.296076099881094e-06, + "loss": 0.3289, + "step": 3666 + }, + { + "epoch": 8.711269699672911, + "grad_norm": 1.7328656911849976, + "learning_rate": 1.2936979785969086e-06, + "loss": 0.3512, + "step": 3667 + }, + { + "epoch": 8.713648528099911, + "grad_norm": 1.495658040046692, + "learning_rate": 1.291319857312723e-06, + "loss": 0.3151, + "step": 3668 + }, + { + "epoch": 8.71602735652691, + "grad_norm": 1.6852049827575684, + "learning_rate": 1.2889417360285374e-06, + "loss": 0.3459, + "step": 3669 + }, + { + "epoch": 8.71840618495391, + "grad_norm": 1.3396629095077515, + "learning_rate": 1.2865636147443518e-06, + "loss": 0.2387, + "step": 3670 + }, + { + "epoch": 8.72078501338091, + "grad_norm": 1.3220832347869873, + "learning_rate": 1.2841854934601667e-06, + "loss": 0.257, + "step": 3671 + }, + { + "epoch": 8.72316384180791, + "grad_norm": 1.5728422403335571, + "learning_rate": 1.281807372175981e-06, + "loss": 0.2713, + "step": 3672 + }, + { + "epoch": 8.725542670234908, + "grad_norm": 1.801188349723816, + "learning_rate": 1.2794292508917955e-06, + "loss": 0.3624, + "step": 3673 + }, + { + "epoch": 8.727921498661908, + "grad_norm": 1.3762673139572144, + "learning_rate": 1.2770511296076103e-06, + "loss": 0.2836, + "step": 3674 + }, + { + "epoch": 8.730300327088909, + "grad_norm": 1.523062825202942, + "learning_rate": 1.2746730083234247e-06, + "loss": 0.2798, + "step": 3675 + }, + { + "epoch": 8.732679155515909, + "grad_norm": 1.5326848030090332, + "learning_rate": 1.272294887039239e-06, + "loss": 0.3589, + "step": 3676 + }, + { + "epoch": 8.735057983942909, + "grad_norm": 1.5912402868270874, + "learning_rate": 1.2699167657550537e-06, + "loss": 0.3336, + "step": 3677 + }, + { + "epoch": 8.737436812369907, + "grad_norm": 1.390525221824646, + "learning_rate": 1.267538644470868e-06, + "loss": 0.2892, + "step": 3678 + }, + { + "epoch": 8.739815640796907, + "grad_norm": 1.6426126956939697, + "learning_rate": 1.2651605231866825e-06, + "loss": 0.2584, + "step": 3679 + }, + { + "epoch": 8.742194469223907, + "grad_norm": 1.402677059173584, + "learning_rate": 1.262782401902497e-06, + "loss": 0.2872, + "step": 3680 + }, + { + "epoch": 8.744573297650907, + "grad_norm": 1.6991604566574097, + "learning_rate": 1.2604042806183117e-06, + "loss": 0.4061, + "step": 3681 + }, + { + "epoch": 8.746952126077908, + "grad_norm": 1.651569128036499, + "learning_rate": 1.2580261593341261e-06, + "loss": 0.3246, + "step": 3682 + }, + { + "epoch": 8.749330954504906, + "grad_norm": 1.4228763580322266, + "learning_rate": 1.2556480380499405e-06, + "loss": 0.3328, + "step": 3683 + }, + { + "epoch": 8.751709782931906, + "grad_norm": 1.457672357559204, + "learning_rate": 1.2532699167657553e-06, + "loss": 0.3416, + "step": 3684 + }, + { + "epoch": 8.754088611358906, + "grad_norm": 1.5210474729537964, + "learning_rate": 1.2508917954815697e-06, + "loss": 0.308, + "step": 3685 + }, + { + "epoch": 8.756467439785906, + "grad_norm": 1.7006677389144897, + "learning_rate": 1.2485136741973841e-06, + "loss": 0.4011, + "step": 3686 + }, + { + "epoch": 8.758846268212904, + "grad_norm": 1.3762685060501099, + "learning_rate": 1.2461355529131987e-06, + "loss": 0.3163, + "step": 3687 + }, + { + "epoch": 8.761225096639905, + "grad_norm": 1.5553098917007446, + "learning_rate": 1.2437574316290133e-06, + "loss": 0.3175, + "step": 3688 + }, + { + "epoch": 8.763603925066905, + "grad_norm": 1.4174240827560425, + "learning_rate": 1.2413793103448277e-06, + "loss": 0.3157, + "step": 3689 + }, + { + "epoch": 8.765982753493905, + "grad_norm": 1.4997735023498535, + "learning_rate": 1.2390011890606421e-06, + "loss": 0.259, + "step": 3690 + }, + { + "epoch": 8.768361581920903, + "grad_norm": 1.6237826347351074, + "learning_rate": 1.2366230677764568e-06, + "loss": 0.3287, + "step": 3691 + }, + { + "epoch": 8.770740410347903, + "grad_norm": 1.378442645072937, + "learning_rate": 1.2342449464922712e-06, + "loss": 0.3566, + "step": 3692 + }, + { + "epoch": 8.773119238774903, + "grad_norm": 1.662424087524414, + "learning_rate": 1.2318668252080856e-06, + "loss": 0.3392, + "step": 3693 + }, + { + "epoch": 8.775498067201903, + "grad_norm": 1.8789523839950562, + "learning_rate": 1.2294887039239002e-06, + "loss": 0.3757, + "step": 3694 + }, + { + "epoch": 8.777876895628903, + "grad_norm": 1.4330552816390991, + "learning_rate": 1.2271105826397148e-06, + "loss": 0.313, + "step": 3695 + }, + { + "epoch": 8.780255724055902, + "grad_norm": 1.7691562175750732, + "learning_rate": 1.2247324613555292e-06, + "loss": 0.3026, + "step": 3696 + }, + { + "epoch": 8.782634552482902, + "grad_norm": 1.6917610168457031, + "learning_rate": 1.2223543400713438e-06, + "loss": 0.3322, + "step": 3697 + }, + { + "epoch": 8.785013380909902, + "grad_norm": 1.5320965051651, + "learning_rate": 1.2199762187871582e-06, + "loss": 0.3432, + "step": 3698 + }, + { + "epoch": 8.787392209336902, + "grad_norm": 1.7975645065307617, + "learning_rate": 1.2175980975029728e-06, + "loss": 0.3743, + "step": 3699 + }, + { + "epoch": 8.7897710377639, + "grad_norm": 1.3942697048187256, + "learning_rate": 1.2152199762187874e-06, + "loss": 0.2504, + "step": 3700 + }, + { + "epoch": 8.7897710377639, + "eval_loss": 0.4259503185749054, + "eval_runtime": 22.5939, + "eval_samples_per_second": 33.106, + "eval_steps_per_second": 16.553, + "step": 3700 + }, + { + "epoch": 8.7921498661909, + "grad_norm": 1.9074801206588745, + "learning_rate": 1.2128418549346018e-06, + "loss": 0.412, + "step": 3701 + }, + { + "epoch": 8.7945286946179, + "grad_norm": 1.533901333808899, + "learning_rate": 1.2104637336504162e-06, + "loss": 0.2333, + "step": 3702 + }, + { + "epoch": 8.7969075230449, + "grad_norm": 1.350075125694275, + "learning_rate": 1.2080856123662308e-06, + "loss": 0.2685, + "step": 3703 + }, + { + "epoch": 8.7992863514719, + "grad_norm": 1.6373611688613892, + "learning_rate": 1.2057074910820452e-06, + "loss": 0.3072, + "step": 3704 + }, + { + "epoch": 8.8016651798989, + "grad_norm": 1.4643265008926392, + "learning_rate": 1.2033293697978596e-06, + "loss": 0.3533, + "step": 3705 + }, + { + "epoch": 8.8040440083259, + "grad_norm": 1.589480996131897, + "learning_rate": 1.2009512485136742e-06, + "loss": 0.2934, + "step": 3706 + }, + { + "epoch": 8.8064228367529, + "grad_norm": 1.5760538578033447, + "learning_rate": 1.1985731272294888e-06, + "loss": 0.3421, + "step": 3707 + }, + { + "epoch": 8.8088016651799, + "grad_norm": 1.4997698068618774, + "learning_rate": 1.1961950059453032e-06, + "loss": 0.2873, + "step": 3708 + }, + { + "epoch": 8.811180493606898, + "grad_norm": 1.3008885383605957, + "learning_rate": 1.1938168846611178e-06, + "loss": 0.2936, + "step": 3709 + }, + { + "epoch": 8.813559322033898, + "grad_norm": 1.5464078187942505, + "learning_rate": 1.1914387633769322e-06, + "loss": 0.2922, + "step": 3710 + }, + { + "epoch": 8.815938150460898, + "grad_norm": 1.6612602472305298, + "learning_rate": 1.1890606420927469e-06, + "loss": 0.2964, + "step": 3711 + }, + { + "epoch": 8.818316978887898, + "grad_norm": 1.399171233177185, + "learning_rate": 1.1866825208085615e-06, + "loss": 0.3344, + "step": 3712 + }, + { + "epoch": 8.820695807314898, + "grad_norm": 1.6280171871185303, + "learning_rate": 1.1843043995243759e-06, + "loss": 0.3061, + "step": 3713 + }, + { + "epoch": 8.823074635741897, + "grad_norm": 1.5112230777740479, + "learning_rate": 1.1819262782401905e-06, + "loss": 0.3427, + "step": 3714 + }, + { + "epoch": 8.825453464168897, + "grad_norm": 1.4831987619400024, + "learning_rate": 1.1795481569560049e-06, + "loss": 0.247, + "step": 3715 + }, + { + "epoch": 8.827832292595897, + "grad_norm": 1.600806474685669, + "learning_rate": 1.1771700356718193e-06, + "loss": 0.3193, + "step": 3716 + }, + { + "epoch": 8.830211121022897, + "grad_norm": 1.3805359601974487, + "learning_rate": 1.1747919143876339e-06, + "loss": 0.3311, + "step": 3717 + }, + { + "epoch": 8.832589949449895, + "grad_norm": 1.4858237504959106, + "learning_rate": 1.1724137931034483e-06, + "loss": 0.2964, + "step": 3718 + }, + { + "epoch": 8.834968777876895, + "grad_norm": 1.6069543361663818, + "learning_rate": 1.170035671819263e-06, + "loss": 0.2943, + "step": 3719 + }, + { + "epoch": 8.837347606303895, + "grad_norm": 1.6131019592285156, + "learning_rate": 1.1676575505350773e-06, + "loss": 0.3301, + "step": 3720 + }, + { + "epoch": 8.839726434730895, + "grad_norm": 1.4386063814163208, + "learning_rate": 1.165279429250892e-06, + "loss": 0.3641, + "step": 3721 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 1.649227261543274, + "learning_rate": 1.1629013079667063e-06, + "loss": 0.3604, + "step": 3722 + }, + { + "epoch": 8.844484091584894, + "grad_norm": 1.6608614921569824, + "learning_rate": 1.160523186682521e-06, + "loss": 0.4344, + "step": 3723 + }, + { + "epoch": 8.846862920011894, + "grad_norm": 1.6867293119430542, + "learning_rate": 1.1581450653983355e-06, + "loss": 0.3292, + "step": 3724 + }, + { + "epoch": 8.849241748438894, + "grad_norm": 1.4168503284454346, + "learning_rate": 1.15576694411415e-06, + "loss": 0.2326, + "step": 3725 + }, + { + "epoch": 8.851620576865894, + "grad_norm": 1.4876222610473633, + "learning_rate": 1.1533888228299645e-06, + "loss": 0.286, + "step": 3726 + }, + { + "epoch": 8.853999405292893, + "grad_norm": 1.6132135391235352, + "learning_rate": 1.151010701545779e-06, + "loss": 0.3182, + "step": 3727 + }, + { + "epoch": 8.856378233719893, + "grad_norm": 1.4098435640335083, + "learning_rate": 1.1486325802615933e-06, + "loss": 0.2751, + "step": 3728 + }, + { + "epoch": 8.858757062146893, + "grad_norm": 1.7248328924179077, + "learning_rate": 1.146254458977408e-06, + "loss": 0.4025, + "step": 3729 + }, + { + "epoch": 8.861135890573893, + "grad_norm": 1.8257653713226318, + "learning_rate": 1.1438763376932223e-06, + "loss": 0.3779, + "step": 3730 + }, + { + "epoch": 8.863514719000893, + "grad_norm": 1.859602451324463, + "learning_rate": 1.141498216409037e-06, + "loss": 0.4367, + "step": 3731 + }, + { + "epoch": 8.865893547427891, + "grad_norm": 1.375943899154663, + "learning_rate": 1.1391200951248514e-06, + "loss": 0.348, + "step": 3732 + }, + { + "epoch": 8.868272375854891, + "grad_norm": 1.8097281455993652, + "learning_rate": 1.136741973840666e-06, + "loss": 0.3334, + "step": 3733 + }, + { + "epoch": 8.870651204281891, + "grad_norm": 1.3355309963226318, + "learning_rate": 1.1343638525564804e-06, + "loss": 0.3047, + "step": 3734 + }, + { + "epoch": 8.873030032708892, + "grad_norm": 1.8775721788406372, + "learning_rate": 1.131985731272295e-06, + "loss": 0.3586, + "step": 3735 + }, + { + "epoch": 8.87540886113589, + "grad_norm": 1.7046539783477783, + "learning_rate": 1.1296076099881096e-06, + "loss": 0.3897, + "step": 3736 + }, + { + "epoch": 8.87778768956289, + "grad_norm": 1.5617187023162842, + "learning_rate": 1.127229488703924e-06, + "loss": 0.2896, + "step": 3737 + }, + { + "epoch": 8.88016651798989, + "grad_norm": 1.6768876314163208, + "learning_rate": 1.1248513674197386e-06, + "loss": 0.3036, + "step": 3738 + }, + { + "epoch": 8.88254534641689, + "grad_norm": 1.3780255317687988, + "learning_rate": 1.122473246135553e-06, + "loss": 0.2766, + "step": 3739 + }, + { + "epoch": 8.884924174843889, + "grad_norm": 1.5731207132339478, + "learning_rate": 1.1200951248513676e-06, + "loss": 0.3287, + "step": 3740 + }, + { + "epoch": 8.887303003270889, + "grad_norm": 1.421142339706421, + "learning_rate": 1.117717003567182e-06, + "loss": 0.3096, + "step": 3741 + }, + { + "epoch": 8.889681831697889, + "grad_norm": 1.5445897579193115, + "learning_rate": 1.1153388822829964e-06, + "loss": 0.332, + "step": 3742 + }, + { + "epoch": 8.892060660124889, + "grad_norm": 1.621748924255371, + "learning_rate": 1.112960760998811e-06, + "loss": 0.3277, + "step": 3743 + }, + { + "epoch": 8.894439488551889, + "grad_norm": 1.4268516302108765, + "learning_rate": 1.1105826397146254e-06, + "loss": 0.293, + "step": 3744 + }, + { + "epoch": 8.896818316978887, + "grad_norm": 1.5215725898742676, + "learning_rate": 1.10820451843044e-06, + "loss": 0.3936, + "step": 3745 + }, + { + "epoch": 8.899197145405887, + "grad_norm": 1.5274910926818848, + "learning_rate": 1.1058263971462546e-06, + "loss": 0.2835, + "step": 3746 + }, + { + "epoch": 8.901575973832887, + "grad_norm": 1.3766536712646484, + "learning_rate": 1.103448275862069e-06, + "loss": 0.3017, + "step": 3747 + }, + { + "epoch": 8.903954802259888, + "grad_norm": 1.5471895933151245, + "learning_rate": 1.1010701545778837e-06, + "loss": 0.273, + "step": 3748 + }, + { + "epoch": 8.906333630686886, + "grad_norm": 1.6966437101364136, + "learning_rate": 1.098692033293698e-06, + "loss": 0.3488, + "step": 3749 + }, + { + "epoch": 8.908712459113886, + "grad_norm": 1.7242950201034546, + "learning_rate": 1.0963139120095127e-06, + "loss": 0.3374, + "step": 3750 + }, + { + "epoch": 8.908712459113886, + "eval_loss": 0.42578697204589844, + "eval_runtime": 22.6841, + "eval_samples_per_second": 32.975, + "eval_steps_per_second": 16.487, + "step": 3750 + }, + { + "epoch": 8.911091287540886, + "grad_norm": 1.406058669090271, + "learning_rate": 1.093935790725327e-06, + "loss": 0.3219, + "step": 3751 + }, + { + "epoch": 8.913470115967886, + "grad_norm": 1.6567151546478271, + "learning_rate": 1.0915576694411417e-06, + "loss": 0.3707, + "step": 3752 + }, + { + "epoch": 8.915848944394886, + "grad_norm": 1.5412391424179077, + "learning_rate": 1.089179548156956e-06, + "loss": 0.3444, + "step": 3753 + }, + { + "epoch": 8.918227772821885, + "grad_norm": 1.5075544118881226, + "learning_rate": 1.0868014268727705e-06, + "loss": 0.3293, + "step": 3754 + }, + { + "epoch": 8.920606601248885, + "grad_norm": 1.5869563817977905, + "learning_rate": 1.084423305588585e-06, + "loss": 0.3252, + "step": 3755 + }, + { + "epoch": 8.922985429675885, + "grad_norm": 1.4815106391906738, + "learning_rate": 1.0820451843043995e-06, + "loss": 0.3107, + "step": 3756 + }, + { + "epoch": 8.925364258102885, + "grad_norm": 1.678187370300293, + "learning_rate": 1.079667063020214e-06, + "loss": 0.3045, + "step": 3757 + }, + { + "epoch": 8.927743086529883, + "grad_norm": 1.767537236213684, + "learning_rate": 1.0772889417360287e-06, + "loss": 0.3227, + "step": 3758 + }, + { + "epoch": 8.930121914956883, + "grad_norm": 1.7505519390106201, + "learning_rate": 1.074910820451843e-06, + "loss": 0.3202, + "step": 3759 + }, + { + "epoch": 8.932500743383883, + "grad_norm": 1.5290918350219727, + "learning_rate": 1.0725326991676577e-06, + "loss": 0.3359, + "step": 3760 + }, + { + "epoch": 8.934879571810884, + "grad_norm": 1.458198070526123, + "learning_rate": 1.0701545778834721e-06, + "loss": 0.3555, + "step": 3761 + }, + { + "epoch": 8.937258400237884, + "grad_norm": 1.5466161966323853, + "learning_rate": 1.0677764565992867e-06, + "loss": 0.2781, + "step": 3762 + }, + { + "epoch": 8.939637228664882, + "grad_norm": 1.6552780866622925, + "learning_rate": 1.0653983353151011e-06, + "loss": 0.3067, + "step": 3763 + }, + { + "epoch": 8.942016057091882, + "grad_norm": 1.5407432317733765, + "learning_rate": 1.0630202140309157e-06, + "loss": 0.3062, + "step": 3764 + }, + { + "epoch": 8.944394885518882, + "grad_norm": 1.377998948097229, + "learning_rate": 1.0606420927467301e-06, + "loss": 0.2663, + "step": 3765 + }, + { + "epoch": 8.946773713945882, + "grad_norm": 1.647327184677124, + "learning_rate": 1.0582639714625447e-06, + "loss": 0.3136, + "step": 3766 + }, + { + "epoch": 8.94915254237288, + "grad_norm": 1.3415474891662598, + "learning_rate": 1.0558858501783591e-06, + "loss": 0.3098, + "step": 3767 + }, + { + "epoch": 8.95153137079988, + "grad_norm": 1.6281086206436157, + "learning_rate": 1.0535077288941735e-06, + "loss": 0.3539, + "step": 3768 + }, + { + "epoch": 8.95391019922688, + "grad_norm": 1.5496877431869507, + "learning_rate": 1.0511296076099881e-06, + "loss": 0.3422, + "step": 3769 + }, + { + "epoch": 8.956289027653881, + "grad_norm": 1.607143759727478, + "learning_rate": 1.0487514863258028e-06, + "loss": 0.3086, + "step": 3770 + }, + { + "epoch": 8.95866785608088, + "grad_norm": 1.4399901628494263, + "learning_rate": 1.0463733650416172e-06, + "loss": 0.3289, + "step": 3771 + }, + { + "epoch": 8.96104668450788, + "grad_norm": 1.4371954202651978, + "learning_rate": 1.0439952437574318e-06, + "loss": 0.3496, + "step": 3772 + }, + { + "epoch": 8.96342551293488, + "grad_norm": 1.5387896299362183, + "learning_rate": 1.0416171224732462e-06, + "loss": 0.2852, + "step": 3773 + }, + { + "epoch": 8.96580434136188, + "grad_norm": 1.663767695426941, + "learning_rate": 1.0392390011890608e-06, + "loss": 0.3818, + "step": 3774 + }, + { + "epoch": 8.96818316978888, + "grad_norm": 1.649552583694458, + "learning_rate": 1.0368608799048754e-06, + "loss": 0.3401, + "step": 3775 + }, + { + "epoch": 8.970561998215878, + "grad_norm": 1.468848705291748, + "learning_rate": 1.0344827586206898e-06, + "loss": 0.3317, + "step": 3776 + }, + { + "epoch": 8.972940826642878, + "grad_norm": 1.656581163406372, + "learning_rate": 1.0321046373365042e-06, + "loss": 0.3284, + "step": 3777 + }, + { + "epoch": 8.975319655069878, + "grad_norm": 1.5970196723937988, + "learning_rate": 1.0297265160523188e-06, + "loss": 0.2939, + "step": 3778 + }, + { + "epoch": 8.977698483496878, + "grad_norm": 1.5218843221664429, + "learning_rate": 1.0273483947681332e-06, + "loss": 0.2994, + "step": 3779 + }, + { + "epoch": 8.980077311923878, + "grad_norm": 1.5507384538650513, + "learning_rate": 1.0249702734839476e-06, + "loss": 0.2699, + "step": 3780 + }, + { + "epoch": 8.982456140350877, + "grad_norm": 1.5033663511276245, + "learning_rate": 1.0225921521997622e-06, + "loss": 0.316, + "step": 3781 + }, + { + "epoch": 8.984834968777877, + "grad_norm": 1.6011765003204346, + "learning_rate": 1.0202140309155768e-06, + "loss": 0.3321, + "step": 3782 + }, + { + "epoch": 8.987213797204877, + "grad_norm": 1.6613545417785645, + "learning_rate": 1.0178359096313912e-06, + "loss": 0.3131, + "step": 3783 + }, + { + "epoch": 8.989592625631877, + "grad_norm": 1.7563362121582031, + "learning_rate": 1.0154577883472058e-06, + "loss": 0.3169, + "step": 3784 + }, + { + "epoch": 8.991971454058875, + "grad_norm": 1.4740163087844849, + "learning_rate": 1.0130796670630202e-06, + "loss": 0.3067, + "step": 3785 + }, + { + "epoch": 8.994350282485875, + "grad_norm": 1.4442068338394165, + "learning_rate": 1.0107015457788348e-06, + "loss": 0.3431, + "step": 3786 + }, + { + "epoch": 8.996729110912876, + "grad_norm": 1.5819159746170044, + "learning_rate": 1.0083234244946495e-06, + "loss": 0.336, + "step": 3787 + }, + { + "epoch": 8.999107939339876, + "grad_norm": 1.4820237159729004, + "learning_rate": 1.0059453032104639e-06, + "loss": 0.2885, + "step": 3788 + }, + { + "epoch": 9.0, + "grad_norm": 2.8606104850769043, + "learning_rate": 1.0035671819262785e-06, + "loss": 0.4066, + "step": 3789 + }, + { + "epoch": 9.002378828427, + "grad_norm": 1.4479261636734009, + "learning_rate": 1.0011890606420929e-06, + "loss": 0.2986, + "step": 3790 + }, + { + "epoch": 9.004757656854, + "grad_norm": 1.7532469034194946, + "learning_rate": 9.988109393579073e-07, + "loss": 0.3533, + "step": 3791 + }, + { + "epoch": 9.007136485280999, + "grad_norm": 1.531294822692871, + "learning_rate": 9.964328180737219e-07, + "loss": 0.3543, + "step": 3792 + }, + { + "epoch": 9.009515313707999, + "grad_norm": 1.5818750858306885, + "learning_rate": 9.940546967895363e-07, + "loss": 0.3037, + "step": 3793 + }, + { + "epoch": 9.011894142134999, + "grad_norm": 1.332526445388794, + "learning_rate": 9.916765755053509e-07, + "loss": 0.2477, + "step": 3794 + }, + { + "epoch": 9.014272970561999, + "grad_norm": 1.6225389242172241, + "learning_rate": 9.892984542211653e-07, + "loss": 0.3101, + "step": 3795 + }, + { + "epoch": 9.016651798988997, + "grad_norm": 1.4937560558319092, + "learning_rate": 9.869203329369799e-07, + "loss": 0.3201, + "step": 3796 + }, + { + "epoch": 9.019030627415997, + "grad_norm": 1.4562296867370605, + "learning_rate": 9.845422116527943e-07, + "loss": 0.3684, + "step": 3797 + }, + { + "epoch": 9.021409455842997, + "grad_norm": 1.5598881244659424, + "learning_rate": 9.82164090368609e-07, + "loss": 0.3335, + "step": 3798 + }, + { + "epoch": 9.023788284269997, + "grad_norm": 1.6649165153503418, + "learning_rate": 9.797859690844235e-07, + "loss": 0.3649, + "step": 3799 + }, + { + "epoch": 9.026167112696998, + "grad_norm": 1.7617136240005493, + "learning_rate": 9.77407847800238e-07, + "loss": 0.3183, + "step": 3800 + }, + { + "epoch": 9.026167112696998, + "eval_loss": 0.4263642728328705, + "eval_runtime": 22.7158, + "eval_samples_per_second": 32.929, + "eval_steps_per_second": 16.464, + "step": 3800 + }, + { + "epoch": 9.028545941123996, + "grad_norm": 1.641108512878418, + "learning_rate": 9.750297265160525e-07, + "loss": 0.3254, + "step": 3801 + }, + { + "epoch": 9.030924769550996, + "grad_norm": 1.4608268737792969, + "learning_rate": 9.72651605231867e-07, + "loss": 0.2797, + "step": 3802 + }, + { + "epoch": 9.033303597977996, + "grad_norm": 1.62559974193573, + "learning_rate": 9.702734839476813e-07, + "loss": 0.3644, + "step": 3803 + }, + { + "epoch": 9.035682426404996, + "grad_norm": 1.608381748199463, + "learning_rate": 9.67895362663496e-07, + "loss": 0.351, + "step": 3804 + }, + { + "epoch": 9.038061254831995, + "grad_norm": 1.402420997619629, + "learning_rate": 9.655172413793103e-07, + "loss": 0.2695, + "step": 3805 + }, + { + "epoch": 9.040440083258995, + "grad_norm": 1.4004439115524292, + "learning_rate": 9.63139120095125e-07, + "loss": 0.3143, + "step": 3806 + }, + { + "epoch": 9.042818911685995, + "grad_norm": 1.495635986328125, + "learning_rate": 9.607609988109393e-07, + "loss": 0.2971, + "step": 3807 + }, + { + "epoch": 9.045197740112995, + "grad_norm": 1.6451293230056763, + "learning_rate": 9.58382877526754e-07, + "loss": 0.2916, + "step": 3808 + }, + { + "epoch": 9.047576568539995, + "grad_norm": 1.597826361656189, + "learning_rate": 9.560047562425684e-07, + "loss": 0.3017, + "step": 3809 + }, + { + "epoch": 9.049955396966993, + "grad_norm": 1.542311429977417, + "learning_rate": 9.53626634958383e-07, + "loss": 0.2608, + "step": 3810 + }, + { + "epoch": 9.052334225393993, + "grad_norm": 1.4700878858566284, + "learning_rate": 9.512485136741975e-07, + "loss": 0.283, + "step": 3811 + }, + { + "epoch": 9.054713053820993, + "grad_norm": 1.6608840227127075, + "learning_rate": 9.48870392390012e-07, + "loss": 0.3211, + "step": 3812 + }, + { + "epoch": 9.057091882247994, + "grad_norm": 1.5146536827087402, + "learning_rate": 9.464922711058265e-07, + "loss": 0.3268, + "step": 3813 + }, + { + "epoch": 9.059470710674992, + "grad_norm": 1.6262614727020264, + "learning_rate": 9.44114149821641e-07, + "loss": 0.2998, + "step": 3814 + }, + { + "epoch": 9.061849539101992, + "grad_norm": 1.3333219289779663, + "learning_rate": 9.417360285374555e-07, + "loss": 0.3012, + "step": 3815 + }, + { + "epoch": 9.064228367528992, + "grad_norm": 1.5682077407836914, + "learning_rate": 9.393579072532701e-07, + "loss": 0.3243, + "step": 3816 + }, + { + "epoch": 9.066607195955992, + "grad_norm": 1.7033385038375854, + "learning_rate": 9.369797859690845e-07, + "loss": 0.3775, + "step": 3817 + }, + { + "epoch": 9.06898602438299, + "grad_norm": 1.7697062492370605, + "learning_rate": 9.34601664684899e-07, + "loss": 0.3203, + "step": 3818 + }, + { + "epoch": 9.07136485280999, + "grad_norm": 1.5809648036956787, + "learning_rate": 9.322235434007135e-07, + "loss": 0.2732, + "step": 3819 + }, + { + "epoch": 9.07374368123699, + "grad_norm": 1.5714980363845825, + "learning_rate": 9.29845422116528e-07, + "loss": 0.3972, + "step": 3820 + }, + { + "epoch": 9.07612250966399, + "grad_norm": 1.735870599746704, + "learning_rate": 9.274673008323424e-07, + "loss": 0.2953, + "step": 3821 + }, + { + "epoch": 9.078501338090991, + "grad_norm": 1.5096570253372192, + "learning_rate": 9.25089179548157e-07, + "loss": 0.3157, + "step": 3822 + }, + { + "epoch": 9.08088016651799, + "grad_norm": 1.7356563806533813, + "learning_rate": 9.227110582639716e-07, + "loss": 0.3269, + "step": 3823 + }, + { + "epoch": 9.08325899494499, + "grad_norm": 1.6344674825668335, + "learning_rate": 9.20332936979786e-07, + "loss": 0.3219, + "step": 3824 + }, + { + "epoch": 9.08563782337199, + "grad_norm": 1.797797441482544, + "learning_rate": 9.179548156956005e-07, + "loss": 0.3272, + "step": 3825 + }, + { + "epoch": 9.08801665179899, + "grad_norm": 1.7052664756774902, + "learning_rate": 9.15576694411415e-07, + "loss": 0.3346, + "step": 3826 + }, + { + "epoch": 9.090395480225988, + "grad_norm": 1.8236368894577026, + "learning_rate": 9.131985731272295e-07, + "loss": 0.3746, + "step": 3827 + }, + { + "epoch": 9.092774308652988, + "grad_norm": 1.5397491455078125, + "learning_rate": 9.108204518430442e-07, + "loss": 0.3465, + "step": 3828 + }, + { + "epoch": 9.095153137079988, + "grad_norm": 1.5676683187484741, + "learning_rate": 9.084423305588586e-07, + "loss": 0.3352, + "step": 3829 + }, + { + "epoch": 9.097531965506988, + "grad_norm": 1.494385838508606, + "learning_rate": 9.060642092746732e-07, + "loss": 0.3385, + "step": 3830 + }, + { + "epoch": 9.099910793933988, + "grad_norm": 1.9002584218978882, + "learning_rate": 9.036860879904876e-07, + "loss": 0.4566, + "step": 3831 + }, + { + "epoch": 9.102289622360987, + "grad_norm": 1.5963010787963867, + "learning_rate": 9.013079667063021e-07, + "loss": 0.2923, + "step": 3832 + }, + { + "epoch": 9.104668450787987, + "grad_norm": 1.377798318862915, + "learning_rate": 8.989298454221167e-07, + "loss": 0.2682, + "step": 3833 + }, + { + "epoch": 9.107047279214987, + "grad_norm": 1.628541350364685, + "learning_rate": 8.965517241379311e-07, + "loss": 0.3054, + "step": 3834 + }, + { + "epoch": 9.109426107641987, + "grad_norm": 1.621376872062683, + "learning_rate": 8.941736028537457e-07, + "loss": 0.3482, + "step": 3835 + }, + { + "epoch": 9.111804936068985, + "grad_norm": 1.3748974800109863, + "learning_rate": 8.917954815695601e-07, + "loss": 0.2922, + "step": 3836 + }, + { + "epoch": 9.114183764495985, + "grad_norm": 1.52541184425354, + "learning_rate": 8.894173602853746e-07, + "loss": 0.3038, + "step": 3837 + }, + { + "epoch": 9.116562592922985, + "grad_norm": 1.5701992511749268, + "learning_rate": 8.870392390011891e-07, + "loss": 0.3565, + "step": 3838 + }, + { + "epoch": 9.118941421349986, + "grad_norm": 1.4226901531219482, + "learning_rate": 8.846611177170036e-07, + "loss": 0.3256, + "step": 3839 + }, + { + "epoch": 9.121320249776986, + "grad_norm": 1.7112653255462646, + "learning_rate": 8.822829964328182e-07, + "loss": 0.3077, + "step": 3840 + }, + { + "epoch": 9.123699078203984, + "grad_norm": 1.5967851877212524, + "learning_rate": 8.799048751486326e-07, + "loss": 0.2718, + "step": 3841 + }, + { + "epoch": 9.126077906630984, + "grad_norm": 1.2504247426986694, + "learning_rate": 8.775267538644472e-07, + "loss": 0.2566, + "step": 3842 + }, + { + "epoch": 9.128456735057984, + "grad_norm": 1.4835447072982788, + "learning_rate": 8.751486325802616e-07, + "loss": 0.321, + "step": 3843 + }, + { + "epoch": 9.130835563484984, + "grad_norm": 1.496480107307434, + "learning_rate": 8.727705112960761e-07, + "loss": 0.2992, + "step": 3844 + }, + { + "epoch": 9.133214391911983, + "grad_norm": 1.6217854022979736, + "learning_rate": 8.703923900118907e-07, + "loss": 0.341, + "step": 3845 + }, + { + "epoch": 9.135593220338983, + "grad_norm": 1.5772401094436646, + "learning_rate": 8.680142687277051e-07, + "loss": 0.3079, + "step": 3846 + }, + { + "epoch": 9.137972048765983, + "grad_norm": 1.7963606119155884, + "learning_rate": 8.656361474435198e-07, + "loss": 0.329, + "step": 3847 + }, + { + "epoch": 9.140350877192983, + "grad_norm": 1.7160887718200684, + "learning_rate": 8.632580261593342e-07, + "loss": 0.3517, + "step": 3848 + }, + { + "epoch": 9.142729705619983, + "grad_norm": 1.3727798461914062, + "learning_rate": 8.608799048751488e-07, + "loss": 0.2668, + "step": 3849 + }, + { + "epoch": 9.145108534046981, + "grad_norm": 1.5238147974014282, + "learning_rate": 8.585017835909632e-07, + "loss": 0.3449, + "step": 3850 + }, + { + "epoch": 9.145108534046981, + "eval_loss": 0.42706114053726196, + "eval_runtime": 22.7904, + "eval_samples_per_second": 32.821, + "eval_steps_per_second": 16.41, + "step": 3850 + }, + { + "epoch": 9.147487362473981, + "grad_norm": 1.7320500612258911, + "learning_rate": 8.561236623067777e-07, + "loss": 0.2851, + "step": 3851 + }, + { + "epoch": 9.149866190900982, + "grad_norm": 1.2119723558425903, + "learning_rate": 8.537455410225923e-07, + "loss": 0.2684, + "step": 3852 + }, + { + "epoch": 9.152245019327982, + "grad_norm": 1.7560784816741943, + "learning_rate": 8.513674197384067e-07, + "loss": 0.3234, + "step": 3853 + }, + { + "epoch": 9.15462384775498, + "grad_norm": 1.5961847305297852, + "learning_rate": 8.489892984542213e-07, + "loss": 0.3505, + "step": 3854 + }, + { + "epoch": 9.15700267618198, + "grad_norm": 1.5974085330963135, + "learning_rate": 8.466111771700357e-07, + "loss": 0.3258, + "step": 3855 + }, + { + "epoch": 9.15938150460898, + "grad_norm": 1.782369613647461, + "learning_rate": 8.442330558858503e-07, + "loss": 0.3828, + "step": 3856 + }, + { + "epoch": 9.16176033303598, + "grad_norm": 1.6862574815750122, + "learning_rate": 8.418549346016648e-07, + "loss": 0.2963, + "step": 3857 + }, + { + "epoch": 9.16413916146298, + "grad_norm": 1.410535216331482, + "learning_rate": 8.394768133174792e-07, + "loss": 0.2523, + "step": 3858 + }, + { + "epoch": 9.166517989889979, + "grad_norm": 1.572076439857483, + "learning_rate": 8.370986920332938e-07, + "loss": 0.3165, + "step": 3859 + }, + { + "epoch": 9.168896818316979, + "grad_norm": 1.6954773664474487, + "learning_rate": 8.347205707491082e-07, + "loss": 0.3177, + "step": 3860 + }, + { + "epoch": 9.171275646743979, + "grad_norm": 1.4911867380142212, + "learning_rate": 8.323424494649228e-07, + "loss": 0.3631, + "step": 3861 + }, + { + "epoch": 9.173654475170979, + "grad_norm": 1.3255330324172974, + "learning_rate": 8.299643281807372e-07, + "loss": 0.2924, + "step": 3862 + }, + { + "epoch": 9.176033303597977, + "grad_norm": 1.6485059261322021, + "learning_rate": 8.275862068965518e-07, + "loss": 0.281, + "step": 3863 + }, + { + "epoch": 9.178412132024977, + "grad_norm": 1.7103766202926636, + "learning_rate": 8.252080856123663e-07, + "loss": 0.3713, + "step": 3864 + }, + { + "epoch": 9.180790960451978, + "grad_norm": 1.487505555152893, + "learning_rate": 8.228299643281807e-07, + "loss": 0.31, + "step": 3865 + }, + { + "epoch": 9.183169788878978, + "grad_norm": 1.8779288530349731, + "learning_rate": 8.204518430439953e-07, + "loss": 0.3396, + "step": 3866 + }, + { + "epoch": 9.185548617305976, + "grad_norm": 1.4556941986083984, + "learning_rate": 8.180737217598097e-07, + "loss": 0.3041, + "step": 3867 + }, + { + "epoch": 9.187927445732976, + "grad_norm": 1.4691352844238281, + "learning_rate": 8.156956004756244e-07, + "loss": 0.301, + "step": 3868 + }, + { + "epoch": 9.190306274159976, + "grad_norm": 1.5058563947677612, + "learning_rate": 8.133174791914389e-07, + "loss": 0.3047, + "step": 3869 + }, + { + "epoch": 9.192685102586976, + "grad_norm": 1.6128007173538208, + "learning_rate": 8.109393579072533e-07, + "loss": 0.2991, + "step": 3870 + }, + { + "epoch": 9.195063931013976, + "grad_norm": 1.6606391668319702, + "learning_rate": 8.085612366230679e-07, + "loss": 0.3316, + "step": 3871 + }, + { + "epoch": 9.197442759440975, + "grad_norm": 1.6112395524978638, + "learning_rate": 8.061831153388823e-07, + "loss": 0.3696, + "step": 3872 + }, + { + "epoch": 9.199821587867975, + "grad_norm": 1.881879210472107, + "learning_rate": 8.038049940546969e-07, + "loss": 0.3082, + "step": 3873 + }, + { + "epoch": 9.202200416294975, + "grad_norm": 1.699297547340393, + "learning_rate": 8.014268727705114e-07, + "loss": 0.3208, + "step": 3874 + }, + { + "epoch": 9.204579244721975, + "grad_norm": 1.7361236810684204, + "learning_rate": 7.990487514863259e-07, + "loss": 0.3863, + "step": 3875 + }, + { + "epoch": 9.206958073148973, + "grad_norm": 1.8119382858276367, + "learning_rate": 7.966706302021404e-07, + "loss": 0.4075, + "step": 3876 + }, + { + "epoch": 9.209336901575973, + "grad_norm": 1.3330410718917847, + "learning_rate": 7.942925089179548e-07, + "loss": 0.2945, + "step": 3877 + }, + { + "epoch": 9.211715730002974, + "grad_norm": 1.6889562606811523, + "learning_rate": 7.919143876337694e-07, + "loss": 0.3568, + "step": 3878 + }, + { + "epoch": 9.214094558429974, + "grad_norm": 1.5688480138778687, + "learning_rate": 7.895362663495838e-07, + "loss": 0.335, + "step": 3879 + }, + { + "epoch": 9.216473386856974, + "grad_norm": 1.749088168144226, + "learning_rate": 7.871581450653984e-07, + "loss": 0.367, + "step": 3880 + }, + { + "epoch": 9.218852215283972, + "grad_norm": 1.4834551811218262, + "learning_rate": 7.847800237812129e-07, + "loss": 0.3213, + "step": 3881 + }, + { + "epoch": 9.221231043710972, + "grad_norm": 1.3080354928970337, + "learning_rate": 7.824019024970274e-07, + "loss": 0.3365, + "step": 3882 + }, + { + "epoch": 9.223609872137972, + "grad_norm": 1.6398401260375977, + "learning_rate": 7.800237812128419e-07, + "loss": 0.3468, + "step": 3883 + }, + { + "epoch": 9.225988700564972, + "grad_norm": 1.7086788415908813, + "learning_rate": 7.776456599286563e-07, + "loss": 0.3257, + "step": 3884 + }, + { + "epoch": 9.22836752899197, + "grad_norm": 1.5071521997451782, + "learning_rate": 7.752675386444709e-07, + "loss": 0.3234, + "step": 3885 + }, + { + "epoch": 9.23074635741897, + "grad_norm": 1.5605233907699585, + "learning_rate": 7.728894173602854e-07, + "loss": 0.2757, + "step": 3886 + }, + { + "epoch": 9.23312518584597, + "grad_norm": 1.7485469579696655, + "learning_rate": 7.705112960761e-07, + "loss": 0.3563, + "step": 3887 + }, + { + "epoch": 9.235504014272971, + "grad_norm": 1.5103603601455688, + "learning_rate": 7.681331747919145e-07, + "loss": 0.2505, + "step": 3888 + }, + { + "epoch": 9.237882842699971, + "grad_norm": 1.4344325065612793, + "learning_rate": 7.65755053507729e-07, + "loss": 0.3037, + "step": 3889 + }, + { + "epoch": 9.24026167112697, + "grad_norm": 1.4007701873779297, + "learning_rate": 7.633769322235435e-07, + "loss": 0.2596, + "step": 3890 + }, + { + "epoch": 9.24264049955397, + "grad_norm": 1.671839952468872, + "learning_rate": 7.609988109393579e-07, + "loss": 0.4016, + "step": 3891 + }, + { + "epoch": 9.24501932798097, + "grad_norm": 1.3365862369537354, + "learning_rate": 7.586206896551725e-07, + "loss": 0.2821, + "step": 3892 + }, + { + "epoch": 9.24739815640797, + "grad_norm": 1.3503822088241577, + "learning_rate": 7.56242568370987e-07, + "loss": 0.2992, + "step": 3893 + }, + { + "epoch": 9.249776984834968, + "grad_norm": 1.6727803945541382, + "learning_rate": 7.538644470868015e-07, + "loss": 0.3424, + "step": 3894 + }, + { + "epoch": 9.252155813261968, + "grad_norm": 1.8679476976394653, + "learning_rate": 7.51486325802616e-07, + "loss": 0.383, + "step": 3895 + }, + { + "epoch": 9.254534641688968, + "grad_norm": 1.6646640300750732, + "learning_rate": 7.491082045184304e-07, + "loss": 0.3458, + "step": 3896 + }, + { + "epoch": 9.256913470115968, + "grad_norm": 2.0038647651672363, + "learning_rate": 7.46730083234245e-07, + "loss": 0.3319, + "step": 3897 + }, + { + "epoch": 9.259292298542968, + "grad_norm": 1.5993553400039673, + "learning_rate": 7.443519619500596e-07, + "loss": 0.3109, + "step": 3898 + }, + { + "epoch": 9.261671126969967, + "grad_norm": 1.4248559474945068, + "learning_rate": 7.41973840665874e-07, + "loss": 0.2824, + "step": 3899 + }, + { + "epoch": 9.264049955396967, + "grad_norm": 1.5825339555740356, + "learning_rate": 7.395957193816885e-07, + "loss": 0.3867, + "step": 3900 + }, + { + "epoch": 9.264049955396967, + "eval_loss": 0.4266693592071533, + "eval_runtime": 22.6715, + "eval_samples_per_second": 32.993, + "eval_steps_per_second": 16.496, + "step": 3900 + }, + { + "epoch": 9.266428783823967, + "grad_norm": 1.44635808467865, + "learning_rate": 7.37217598097503e-07, + "loss": 0.259, + "step": 3901 + }, + { + "epoch": 9.268807612250967, + "grad_norm": 1.571354866027832, + "learning_rate": 7.348394768133175e-07, + "loss": 0.3452, + "step": 3902 + }, + { + "epoch": 9.271186440677965, + "grad_norm": 1.6857898235321045, + "learning_rate": 7.324613555291321e-07, + "loss": 0.3211, + "step": 3903 + }, + { + "epoch": 9.273565269104965, + "grad_norm": 1.8715864419937134, + "learning_rate": 7.300832342449465e-07, + "loss": 0.3833, + "step": 3904 + }, + { + "epoch": 9.275944097531966, + "grad_norm": 1.4579352140426636, + "learning_rate": 7.277051129607611e-07, + "loss": 0.313, + "step": 3905 + }, + { + "epoch": 9.278322925958966, + "grad_norm": 1.595421314239502, + "learning_rate": 7.253269916765755e-07, + "loss": 0.2407, + "step": 3906 + }, + { + "epoch": 9.280701754385966, + "grad_norm": 1.5615202188491821, + "learning_rate": 7.229488703923901e-07, + "loss": 0.3423, + "step": 3907 + }, + { + "epoch": 9.283080582812964, + "grad_norm": 1.7300091981887817, + "learning_rate": 7.205707491082046e-07, + "loss": 0.3055, + "step": 3908 + }, + { + "epoch": 9.285459411239964, + "grad_norm": 1.8103034496307373, + "learning_rate": 7.181926278240191e-07, + "loss": 0.3438, + "step": 3909 + }, + { + "epoch": 9.287838239666964, + "grad_norm": 1.7838460206985474, + "learning_rate": 7.158145065398337e-07, + "loss": 0.3555, + "step": 3910 + }, + { + "epoch": 9.290217068093964, + "grad_norm": 1.6418458223342896, + "learning_rate": 7.134363852556481e-07, + "loss": 0.296, + "step": 3911 + }, + { + "epoch": 9.292595896520963, + "grad_norm": 1.644800066947937, + "learning_rate": 7.110582639714626e-07, + "loss": 0.3226, + "step": 3912 + }, + { + "epoch": 9.294974724947963, + "grad_norm": 1.5047404766082764, + "learning_rate": 7.086801426872771e-07, + "loss": 0.2773, + "step": 3913 + }, + { + "epoch": 9.297353553374963, + "grad_norm": 1.6783963441848755, + "learning_rate": 7.063020214030916e-07, + "loss": 0.3183, + "step": 3914 + }, + { + "epoch": 9.299732381801963, + "grad_norm": 1.5718568563461304, + "learning_rate": 7.039239001189062e-07, + "loss": 0.3058, + "step": 3915 + }, + { + "epoch": 9.302111210228961, + "grad_norm": 1.7384310960769653, + "learning_rate": 7.015457788347206e-07, + "loss": 0.3174, + "step": 3916 + }, + { + "epoch": 9.304490038655961, + "grad_norm": 1.488249659538269, + "learning_rate": 6.991676575505352e-07, + "loss": 0.3129, + "step": 3917 + }, + { + "epoch": 9.306868867082962, + "grad_norm": 1.5504875183105469, + "learning_rate": 6.967895362663496e-07, + "loss": 0.2989, + "step": 3918 + }, + { + "epoch": 9.309247695509962, + "grad_norm": 1.7251152992248535, + "learning_rate": 6.944114149821641e-07, + "loss": 0.2643, + "step": 3919 + }, + { + "epoch": 9.311626523936962, + "grad_norm": 1.542085886001587, + "learning_rate": 6.920332936979786e-07, + "loss": 0.3509, + "step": 3920 + }, + { + "epoch": 9.31400535236396, + "grad_norm": 1.547103762626648, + "learning_rate": 6.896551724137931e-07, + "loss": 0.3001, + "step": 3921 + }, + { + "epoch": 9.31638418079096, + "grad_norm": 1.4196369647979736, + "learning_rate": 6.872770511296077e-07, + "loss": 0.3172, + "step": 3922 + }, + { + "epoch": 9.31876300921796, + "grad_norm": 1.5661107301712036, + "learning_rate": 6.848989298454221e-07, + "loss": 0.312, + "step": 3923 + }, + { + "epoch": 9.32114183764496, + "grad_norm": 1.593098521232605, + "learning_rate": 6.825208085612367e-07, + "loss": 0.3306, + "step": 3924 + }, + { + "epoch": 9.323520666071959, + "grad_norm": 1.8822636604309082, + "learning_rate": 6.801426872770511e-07, + "loss": 0.3134, + "step": 3925 + }, + { + "epoch": 9.325899494498959, + "grad_norm": 1.638013482093811, + "learning_rate": 6.777645659928656e-07, + "loss": 0.3433, + "step": 3926 + }, + { + "epoch": 9.328278322925959, + "grad_norm": 1.6825361251831055, + "learning_rate": 6.753864447086803e-07, + "loss": 0.3403, + "step": 3927 + }, + { + "epoch": 9.330657151352959, + "grad_norm": 1.4327155351638794, + "learning_rate": 6.730083234244947e-07, + "loss": 0.2477, + "step": 3928 + }, + { + "epoch": 9.33303597977996, + "grad_norm": 1.6838940382003784, + "learning_rate": 6.706302021403093e-07, + "loss": 0.3468, + "step": 3929 + }, + { + "epoch": 9.335414808206957, + "grad_norm": 1.5850017070770264, + "learning_rate": 6.682520808561237e-07, + "loss": 0.3522, + "step": 3930 + }, + { + "epoch": 9.337793636633958, + "grad_norm": 1.475049376487732, + "learning_rate": 6.658739595719383e-07, + "loss": 0.3439, + "step": 3931 + }, + { + "epoch": 9.340172465060958, + "grad_norm": 1.5649096965789795, + "learning_rate": 6.634958382877528e-07, + "loss": 0.3557, + "step": 3932 + }, + { + "epoch": 9.342551293487958, + "grad_norm": 1.7286350727081299, + "learning_rate": 6.611177170035672e-07, + "loss": 0.3194, + "step": 3933 + }, + { + "epoch": 9.344930121914956, + "grad_norm": 1.3923090696334839, + "learning_rate": 6.587395957193818e-07, + "loss": 0.2837, + "step": 3934 + }, + { + "epoch": 9.347308950341956, + "grad_norm": 1.6163533926010132, + "learning_rate": 6.563614744351962e-07, + "loss": 0.3755, + "step": 3935 + }, + { + "epoch": 9.349687778768956, + "grad_norm": 1.5562987327575684, + "learning_rate": 6.539833531510108e-07, + "loss": 0.3104, + "step": 3936 + }, + { + "epoch": 9.352066607195956, + "grad_norm": 1.7122493982315063, + "learning_rate": 6.516052318668252e-07, + "loss": 0.408, + "step": 3937 + }, + { + "epoch": 9.354445435622956, + "grad_norm": 1.8444535732269287, + "learning_rate": 6.492271105826398e-07, + "loss": 0.4153, + "step": 3938 + }, + { + "epoch": 9.356824264049955, + "grad_norm": 1.5348180532455444, + "learning_rate": 6.468489892984543e-07, + "loss": 0.3213, + "step": 3939 + }, + { + "epoch": 9.359203092476955, + "grad_norm": 1.6209564208984375, + "learning_rate": 6.444708680142687e-07, + "loss": 0.333, + "step": 3940 + }, + { + "epoch": 9.361581920903955, + "grad_norm": 1.5803405046463013, + "learning_rate": 6.420927467300833e-07, + "loss": 0.2944, + "step": 3941 + }, + { + "epoch": 9.363960749330955, + "grad_norm": 1.5471481084823608, + "learning_rate": 6.397146254458977e-07, + "loss": 0.3504, + "step": 3942 + }, + { + "epoch": 9.366339577757953, + "grad_norm": 1.784106731414795, + "learning_rate": 6.373365041617123e-07, + "loss": 0.4126, + "step": 3943 + }, + { + "epoch": 9.368718406184954, + "grad_norm": 1.5933451652526855, + "learning_rate": 6.349583828775268e-07, + "loss": 0.3495, + "step": 3944 + }, + { + "epoch": 9.371097234611954, + "grad_norm": 1.7055026292800903, + "learning_rate": 6.325802615933412e-07, + "loss": 0.3218, + "step": 3945 + }, + { + "epoch": 9.373476063038954, + "grad_norm": 1.5506031513214111, + "learning_rate": 6.302021403091559e-07, + "loss": 0.3127, + "step": 3946 + }, + { + "epoch": 9.375854891465954, + "grad_norm": 1.68362295627594, + "learning_rate": 6.278240190249703e-07, + "loss": 0.3061, + "step": 3947 + }, + { + "epoch": 9.378233719892952, + "grad_norm": 1.6470222473144531, + "learning_rate": 6.254458977407849e-07, + "loss": 0.3476, + "step": 3948 + }, + { + "epoch": 9.380612548319952, + "grad_norm": 1.3188354969024658, + "learning_rate": 6.230677764565994e-07, + "loss": 0.246, + "step": 3949 + }, + { + "epoch": 9.382991376746952, + "grad_norm": 1.6917141675949097, + "learning_rate": 6.206896551724139e-07, + "loss": 0.3549, + "step": 3950 + }, + { + "epoch": 9.382991376746952, + "eval_loss": 0.42687931656837463, + "eval_runtime": 22.9851, + "eval_samples_per_second": 32.543, + "eval_steps_per_second": 16.271, + "step": 3950 + }, + { + "epoch": 9.385370205173952, + "grad_norm": 1.4022612571716309, + "learning_rate": 6.183115338882284e-07, + "loss": 0.2813, + "step": 3951 + }, + { + "epoch": 9.38774903360095, + "grad_norm": 1.6139168739318848, + "learning_rate": 6.159334126040428e-07, + "loss": 0.3366, + "step": 3952 + }, + { + "epoch": 9.390127862027951, + "grad_norm": 1.5843446254730225, + "learning_rate": 6.135552913198574e-07, + "loss": 0.2849, + "step": 3953 + }, + { + "epoch": 9.392506690454951, + "grad_norm": 1.5010361671447754, + "learning_rate": 6.111771700356719e-07, + "loss": 0.3251, + "step": 3954 + }, + { + "epoch": 9.394885518881951, + "grad_norm": 1.552485466003418, + "learning_rate": 6.087990487514864e-07, + "loss": 0.3523, + "step": 3955 + }, + { + "epoch": 9.397264347308951, + "grad_norm": 1.5776777267456055, + "learning_rate": 6.064209274673009e-07, + "loss": 0.2864, + "step": 3956 + }, + { + "epoch": 9.39964317573595, + "grad_norm": 1.8439611196517944, + "learning_rate": 6.040428061831154e-07, + "loss": 0.3383, + "step": 3957 + }, + { + "epoch": 9.40202200416295, + "grad_norm": 1.5968550443649292, + "learning_rate": 6.016646848989298e-07, + "loss": 0.3214, + "step": 3958 + }, + { + "epoch": 9.40440083258995, + "grad_norm": 1.477336049079895, + "learning_rate": 5.992865636147444e-07, + "loss": 0.2755, + "step": 3959 + }, + { + "epoch": 9.40677966101695, + "grad_norm": 1.8656772375106812, + "learning_rate": 5.969084423305589e-07, + "loss": 0.3563, + "step": 3960 + }, + { + "epoch": 9.409158489443948, + "grad_norm": 1.4835832118988037, + "learning_rate": 5.945303210463734e-07, + "loss": 0.2829, + "step": 3961 + }, + { + "epoch": 9.411537317870948, + "grad_norm": 1.6682826280593872, + "learning_rate": 5.921521997621879e-07, + "loss": 0.4067, + "step": 3962 + }, + { + "epoch": 9.413916146297948, + "grad_norm": 1.6261913776397705, + "learning_rate": 5.897740784780024e-07, + "loss": 0.3275, + "step": 3963 + }, + { + "epoch": 9.416294974724948, + "grad_norm": 1.6861552000045776, + "learning_rate": 5.873959571938169e-07, + "loss": 0.3942, + "step": 3964 + }, + { + "epoch": 9.418673803151947, + "grad_norm": 1.5152397155761719, + "learning_rate": 5.850178359096314e-07, + "loss": 0.2936, + "step": 3965 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 1.6734514236450195, + "learning_rate": 5.82639714625446e-07, + "loss": 0.3142, + "step": 3966 + }, + { + "epoch": 9.423431460005947, + "grad_norm": 1.7036266326904297, + "learning_rate": 5.802615933412605e-07, + "loss": 0.3251, + "step": 3967 + }, + { + "epoch": 9.425810288432947, + "grad_norm": 1.5996062755584717, + "learning_rate": 5.77883472057075e-07, + "loss": 0.4389, + "step": 3968 + }, + { + "epoch": 9.428189116859947, + "grad_norm": 1.592655062675476, + "learning_rate": 5.755053507728895e-07, + "loss": 0.3445, + "step": 3969 + }, + { + "epoch": 9.430567945286946, + "grad_norm": 1.538851022720337, + "learning_rate": 5.73127229488704e-07, + "loss": 0.3266, + "step": 3970 + }, + { + "epoch": 9.432946773713946, + "grad_norm": 1.6751470565795898, + "learning_rate": 5.707491082045185e-07, + "loss": 0.3776, + "step": 3971 + }, + { + "epoch": 9.435325602140946, + "grad_norm": 1.4246760606765747, + "learning_rate": 5.68370986920333e-07, + "loss": 0.3036, + "step": 3972 + }, + { + "epoch": 9.437704430567946, + "grad_norm": 1.6333016157150269, + "learning_rate": 5.659928656361475e-07, + "loss": 0.3436, + "step": 3973 + }, + { + "epoch": 9.440083258994944, + "grad_norm": 1.5650068521499634, + "learning_rate": 5.63614744351962e-07, + "loss": 0.3447, + "step": 3974 + }, + { + "epoch": 9.442462087421944, + "grad_norm": 1.6601653099060059, + "learning_rate": 5.612366230677765e-07, + "loss": 0.3139, + "step": 3975 + }, + { + "epoch": 9.444840915848944, + "grad_norm": 1.2898657321929932, + "learning_rate": 5.58858501783591e-07, + "loss": 0.3231, + "step": 3976 + }, + { + "epoch": 9.447219744275944, + "grad_norm": 1.5274943113327026, + "learning_rate": 5.564803804994055e-07, + "loss": 0.3359, + "step": 3977 + }, + { + "epoch": 9.449598572702945, + "grad_norm": 1.5498411655426025, + "learning_rate": 5.5410225921522e-07, + "loss": 0.3358, + "step": 3978 + }, + { + "epoch": 9.451977401129943, + "grad_norm": 1.6425604820251465, + "learning_rate": 5.517241379310345e-07, + "loss": 0.3143, + "step": 3979 + }, + { + "epoch": 9.454356229556943, + "grad_norm": 1.4356712102890015, + "learning_rate": 5.49346016646849e-07, + "loss": 0.3513, + "step": 3980 + }, + { + "epoch": 9.456735057983943, + "grad_norm": 1.5949715375900269, + "learning_rate": 5.469678953626635e-07, + "loss": 0.3319, + "step": 3981 + }, + { + "epoch": 9.459113886410943, + "grad_norm": 1.5230776071548462, + "learning_rate": 5.44589774078478e-07, + "loss": 0.3566, + "step": 3982 + }, + { + "epoch": 9.461492714837942, + "grad_norm": 1.4821454286575317, + "learning_rate": 5.422116527942925e-07, + "loss": 0.3022, + "step": 3983 + }, + { + "epoch": 9.463871543264942, + "grad_norm": 1.396803379058838, + "learning_rate": 5.39833531510107e-07, + "loss": 0.3144, + "step": 3984 + }, + { + "epoch": 9.466250371691942, + "grad_norm": 1.4549235105514526, + "learning_rate": 5.374554102259215e-07, + "loss": 0.2823, + "step": 3985 + }, + { + "epoch": 9.468629200118942, + "grad_norm": 1.4758179187774658, + "learning_rate": 5.350772889417361e-07, + "loss": 0.2906, + "step": 3986 + }, + { + "epoch": 9.471008028545942, + "grad_norm": 1.7131702899932861, + "learning_rate": 5.326991676575506e-07, + "loss": 0.3492, + "step": 3987 + }, + { + "epoch": 9.47338685697294, + "grad_norm": 1.6978141069412231, + "learning_rate": 5.303210463733651e-07, + "loss": 0.3589, + "step": 3988 + }, + { + "epoch": 9.47576568539994, + "grad_norm": 1.4608232975006104, + "learning_rate": 5.279429250891796e-07, + "loss": 0.3045, + "step": 3989 + }, + { + "epoch": 9.47814451382694, + "grad_norm": 1.665734052658081, + "learning_rate": 5.255648038049941e-07, + "loss": 0.3699, + "step": 3990 + }, + { + "epoch": 9.48052334225394, + "grad_norm": 1.7299957275390625, + "learning_rate": 5.231866825208086e-07, + "loss": 0.3143, + "step": 3991 + }, + { + "epoch": 9.482902170680939, + "grad_norm": 1.3991278409957886, + "learning_rate": 5.208085612366231e-07, + "loss": 0.3061, + "step": 3992 + }, + { + "epoch": 9.485280999107939, + "grad_norm": 1.549603819847107, + "learning_rate": 5.184304399524377e-07, + "loss": 0.3138, + "step": 3993 + }, + { + "epoch": 9.487659827534939, + "grad_norm": 1.5384782552719116, + "learning_rate": 5.160523186682521e-07, + "loss": 0.3487, + "step": 3994 + }, + { + "epoch": 9.49003865596194, + "grad_norm": 1.5312919616699219, + "learning_rate": 5.136741973840666e-07, + "loss": 0.2577, + "step": 3995 + }, + { + "epoch": 9.49241748438894, + "grad_norm": 1.4340932369232178, + "learning_rate": 5.112960760998811e-07, + "loss": 0.3239, + "step": 3996 + }, + { + "epoch": 9.494796312815938, + "grad_norm": 1.4212329387664795, + "learning_rate": 5.089179548156956e-07, + "loss": 0.2827, + "step": 3997 + }, + { + "epoch": 9.497175141242938, + "grad_norm": 1.716859221458435, + "learning_rate": 5.065398335315101e-07, + "loss": 0.3069, + "step": 3998 + }, + { + "epoch": 9.499553969669938, + "grad_norm": 1.6228601932525635, + "learning_rate": 5.041617122473247e-07, + "loss": 0.2949, + "step": 3999 + }, + { + "epoch": 9.501932798096938, + "grad_norm": 1.4377381801605225, + "learning_rate": 5.017835909631392e-07, + "loss": 0.2963, + "step": 4000 + }, + { + "epoch": 9.501932798096938, + "eval_loss": 0.42700162529945374, + "eval_runtime": 22.5575, + "eval_samples_per_second": 33.16, + "eval_steps_per_second": 16.58, + "step": 4000 + }, + { + "epoch": 9.504311626523936, + "grad_norm": 1.6318362951278687, + "learning_rate": 4.994054696789536e-07, + "loss": 0.269, + "step": 4001 + }, + { + "epoch": 9.506690454950936, + "grad_norm": 1.4662967920303345, + "learning_rate": 4.970273483947681e-07, + "loss": 0.2896, + "step": 4002 + }, + { + "epoch": 9.509069283377936, + "grad_norm": 1.5030030012130737, + "learning_rate": 4.946492271105826e-07, + "loss": 0.3227, + "step": 4003 + }, + { + "epoch": 9.511448111804937, + "grad_norm": 1.5962183475494385, + "learning_rate": 4.922711058263971e-07, + "loss": 0.3056, + "step": 4004 + }, + { + "epoch": 9.513826940231937, + "grad_norm": 1.5691380500793457, + "learning_rate": 4.898929845422118e-07, + "loss": 0.3024, + "step": 4005 + }, + { + "epoch": 9.516205768658935, + "grad_norm": 1.4072649478912354, + "learning_rate": 4.875148632580263e-07, + "loss": 0.3537, + "step": 4006 + }, + { + "epoch": 9.518584597085935, + "grad_norm": 1.968650460243225, + "learning_rate": 4.851367419738407e-07, + "loss": 0.4415, + "step": 4007 + }, + { + "epoch": 9.520963425512935, + "grad_norm": 1.568570613861084, + "learning_rate": 4.827586206896552e-07, + "loss": 0.308, + "step": 4008 + }, + { + "epoch": 9.523342253939935, + "grad_norm": 1.7893601655960083, + "learning_rate": 4.803804994054697e-07, + "loss": 0.3456, + "step": 4009 + }, + { + "epoch": 9.525721082366934, + "grad_norm": 1.6020379066467285, + "learning_rate": 4.780023781212842e-07, + "loss": 0.3148, + "step": 4010 + }, + { + "epoch": 9.528099910793934, + "grad_norm": 1.5546603202819824, + "learning_rate": 4.7562425683709873e-07, + "loss": 0.2685, + "step": 4011 + }, + { + "epoch": 9.530478739220934, + "grad_norm": 1.652596354484558, + "learning_rate": 4.7324613555291324e-07, + "loss": 0.3664, + "step": 4012 + }, + { + "epoch": 9.532857567647934, + "grad_norm": 1.729309320449829, + "learning_rate": 4.7086801426872774e-07, + "loss": 0.3297, + "step": 4013 + }, + { + "epoch": 9.535236396074932, + "grad_norm": 1.6772130727767944, + "learning_rate": 4.6848989298454225e-07, + "loss": 0.3256, + "step": 4014 + }, + { + "epoch": 9.537615224501932, + "grad_norm": 1.5327156782150269, + "learning_rate": 4.6611177170035675e-07, + "loss": 0.309, + "step": 4015 + }, + { + "epoch": 9.539994052928932, + "grad_norm": 1.5528484582901, + "learning_rate": 4.637336504161712e-07, + "loss": 0.3132, + "step": 4016 + }, + { + "epoch": 9.542372881355933, + "grad_norm": 1.7993724346160889, + "learning_rate": 4.613555291319858e-07, + "loss": 0.3321, + "step": 4017 + }, + { + "epoch": 9.544751709782933, + "grad_norm": 1.6126142740249634, + "learning_rate": 4.5897740784780027e-07, + "loss": 0.3063, + "step": 4018 + }, + { + "epoch": 9.547130538209931, + "grad_norm": 1.623367428779602, + "learning_rate": 4.5659928656361477e-07, + "loss": 0.285, + "step": 4019 + }, + { + "epoch": 9.549509366636931, + "grad_norm": 1.4000245332717896, + "learning_rate": 4.542211652794293e-07, + "loss": 0.2963, + "step": 4020 + }, + { + "epoch": 9.551888195063931, + "grad_norm": 1.7888904809951782, + "learning_rate": 4.518430439952438e-07, + "loss": 0.3582, + "step": 4021 + }, + { + "epoch": 9.554267023490931, + "grad_norm": 1.7076871395111084, + "learning_rate": 4.4946492271105834e-07, + "loss": 0.2442, + "step": 4022 + }, + { + "epoch": 9.55664585191793, + "grad_norm": 1.6720407009124756, + "learning_rate": 4.4708680142687285e-07, + "loss": 0.3989, + "step": 4023 + }, + { + "epoch": 9.55902468034493, + "grad_norm": 1.9011660814285278, + "learning_rate": 4.447086801426873e-07, + "loss": 0.294, + "step": 4024 + }, + { + "epoch": 9.56140350877193, + "grad_norm": 1.4829168319702148, + "learning_rate": 4.423305588585018e-07, + "loss": 0.2879, + "step": 4025 + }, + { + "epoch": 9.56378233719893, + "grad_norm": 1.6958966255187988, + "learning_rate": 4.399524375743163e-07, + "loss": 0.3064, + "step": 4026 + }, + { + "epoch": 9.56616116562593, + "grad_norm": 1.4572391510009766, + "learning_rate": 4.375743162901308e-07, + "loss": 0.3033, + "step": 4027 + }, + { + "epoch": 9.568539994052928, + "grad_norm": 1.6414463520050049, + "learning_rate": 4.3519619500594537e-07, + "loss": 0.3102, + "step": 4028 + }, + { + "epoch": 9.570918822479928, + "grad_norm": 1.7459945678710938, + "learning_rate": 4.328180737217599e-07, + "loss": 0.2946, + "step": 4029 + }, + { + "epoch": 9.573297650906929, + "grad_norm": 1.5155504941940308, + "learning_rate": 4.304399524375744e-07, + "loss": 0.3002, + "step": 4030 + }, + { + "epoch": 9.575676479333929, + "grad_norm": 1.3077186346054077, + "learning_rate": 4.2806183115338883e-07, + "loss": 0.2257, + "step": 4031 + }, + { + "epoch": 9.578055307760927, + "grad_norm": 1.5316224098205566, + "learning_rate": 4.2568370986920334e-07, + "loss": 0.3334, + "step": 4032 + }, + { + "epoch": 9.580434136187927, + "grad_norm": 1.652294635772705, + "learning_rate": 4.2330558858501784e-07, + "loss": 0.2793, + "step": 4033 + }, + { + "epoch": 9.582812964614927, + "grad_norm": 1.5119520425796509, + "learning_rate": 4.209274673008324e-07, + "loss": 0.2918, + "step": 4034 + }, + { + "epoch": 9.585191793041927, + "grad_norm": 1.5302040576934814, + "learning_rate": 4.185493460166469e-07, + "loss": 0.266, + "step": 4035 + }, + { + "epoch": 9.587570621468927, + "grad_norm": 1.5043840408325195, + "learning_rate": 4.161712247324614e-07, + "loss": 0.3078, + "step": 4036 + }, + { + "epoch": 9.589949449895926, + "grad_norm": 1.742556095123291, + "learning_rate": 4.137931034482759e-07, + "loss": 0.3276, + "step": 4037 + }, + { + "epoch": 9.592328278322926, + "grad_norm": 1.4804481267929077, + "learning_rate": 4.1141498216409037e-07, + "loss": 0.3298, + "step": 4038 + }, + { + "epoch": 9.594707106749926, + "grad_norm": 1.7015149593353271, + "learning_rate": 4.0903686087990487e-07, + "loss": 0.3292, + "step": 4039 + }, + { + "epoch": 9.597085935176926, + "grad_norm": 1.445074439048767, + "learning_rate": 4.0665873959571943e-07, + "loss": 0.2693, + "step": 4040 + }, + { + "epoch": 9.599464763603924, + "grad_norm": 1.7358458042144775, + "learning_rate": 4.0428061831153394e-07, + "loss": 0.3537, + "step": 4041 + }, + { + "epoch": 9.601843592030924, + "grad_norm": 1.6926034688949585, + "learning_rate": 4.0190249702734844e-07, + "loss": 0.4005, + "step": 4042 + }, + { + "epoch": 9.604222420457925, + "grad_norm": 1.8589072227478027, + "learning_rate": 3.9952437574316295e-07, + "loss": 0.3114, + "step": 4043 + }, + { + "epoch": 9.606601248884925, + "grad_norm": 1.57071852684021, + "learning_rate": 3.971462544589774e-07, + "loss": 0.2516, + "step": 4044 + }, + { + "epoch": 9.608980077311923, + "grad_norm": 1.557504415512085, + "learning_rate": 3.947681331747919e-07, + "loss": 0.2973, + "step": 4045 + }, + { + "epoch": 9.611358905738923, + "grad_norm": 1.6545915603637695, + "learning_rate": 3.9239001189060646e-07, + "loss": 0.333, + "step": 4046 + }, + { + "epoch": 9.613737734165923, + "grad_norm": 1.599575161933899, + "learning_rate": 3.9001189060642097e-07, + "loss": 0.4135, + "step": 4047 + }, + { + "epoch": 9.616116562592923, + "grad_norm": 1.5218342542648315, + "learning_rate": 3.8763376932223547e-07, + "loss": 0.3691, + "step": 4048 + }, + { + "epoch": 9.618495391019923, + "grad_norm": 1.8146811723709106, + "learning_rate": 3.8525564803805e-07, + "loss": 0.3626, + "step": 4049 + }, + { + "epoch": 9.620874219446922, + "grad_norm": 1.5476871728897095, + "learning_rate": 3.828775267538645e-07, + "loss": 0.3216, + "step": 4050 + }, + { + "epoch": 9.620874219446922, + "eval_loss": 0.4277096390724182, + "eval_runtime": 22.6161, + "eval_samples_per_second": 33.074, + "eval_steps_per_second": 16.537, + "step": 4050 + }, + { + "epoch": 9.623253047873922, + "grad_norm": 1.9093314409255981, + "learning_rate": 3.8049940546967893e-07, + "loss": 0.3314, + "step": 4051 + }, + { + "epoch": 9.625631876300922, + "grad_norm": 1.7769403457641602, + "learning_rate": 3.781212841854935e-07, + "loss": 0.3639, + "step": 4052 + }, + { + "epoch": 9.628010704727922, + "grad_norm": 1.5246273279190063, + "learning_rate": 3.75743162901308e-07, + "loss": 0.3015, + "step": 4053 + }, + { + "epoch": 9.630389533154922, + "grad_norm": 1.6685805320739746, + "learning_rate": 3.733650416171225e-07, + "loss": 0.277, + "step": 4054 + }, + { + "epoch": 9.63276836158192, + "grad_norm": 1.4544271230697632, + "learning_rate": 3.70986920332937e-07, + "loss": 0.272, + "step": 4055 + }, + { + "epoch": 9.63514719000892, + "grad_norm": 1.7431800365447998, + "learning_rate": 3.686087990487515e-07, + "loss": 0.3178, + "step": 4056 + }, + { + "epoch": 9.63752601843592, + "grad_norm": 1.7102618217468262, + "learning_rate": 3.6623067776456607e-07, + "loss": 0.3282, + "step": 4057 + }, + { + "epoch": 9.63990484686292, + "grad_norm": 1.7288769483566284, + "learning_rate": 3.638525564803806e-07, + "loss": 0.3375, + "step": 4058 + }, + { + "epoch": 9.642283675289919, + "grad_norm": 1.7365422248840332, + "learning_rate": 3.6147443519619503e-07, + "loss": 0.3391, + "step": 4059 + }, + { + "epoch": 9.64466250371692, + "grad_norm": 1.4502772092819214, + "learning_rate": 3.5909631391200953e-07, + "loss": 0.2795, + "step": 4060 + }, + { + "epoch": 9.64704133214392, + "grad_norm": 1.562796950340271, + "learning_rate": 3.5671819262782404e-07, + "loss": 0.363, + "step": 4061 + }, + { + "epoch": 9.64942016057092, + "grad_norm": 1.5762161016464233, + "learning_rate": 3.5434007134363854e-07, + "loss": 0.3372, + "step": 4062 + }, + { + "epoch": 9.651798988997918, + "grad_norm": 1.8893486261367798, + "learning_rate": 3.519619500594531e-07, + "loss": 0.3527, + "step": 4063 + }, + { + "epoch": 9.654177817424918, + "grad_norm": 1.5574511289596558, + "learning_rate": 3.495838287752676e-07, + "loss": 0.2763, + "step": 4064 + }, + { + "epoch": 9.656556645851918, + "grad_norm": 1.477138638496399, + "learning_rate": 3.4720570749108206e-07, + "loss": 0.2802, + "step": 4065 + }, + { + "epoch": 9.658935474278918, + "grad_norm": 1.4103556871414185, + "learning_rate": 3.4482758620689656e-07, + "loss": 0.3059, + "step": 4066 + }, + { + "epoch": 9.661314302705918, + "grad_norm": 1.588335394859314, + "learning_rate": 3.4244946492271107e-07, + "loss": 0.299, + "step": 4067 + }, + { + "epoch": 9.663693131132916, + "grad_norm": 1.781563401222229, + "learning_rate": 3.4007134363852557e-07, + "loss": 0.3578, + "step": 4068 + }, + { + "epoch": 9.666071959559916, + "grad_norm": 1.3974297046661377, + "learning_rate": 3.3769322235434013e-07, + "loss": 0.2612, + "step": 4069 + }, + { + "epoch": 9.668450787986917, + "grad_norm": 1.606660008430481, + "learning_rate": 3.3531510107015463e-07, + "loss": 0.3818, + "step": 4070 + }, + { + "epoch": 9.670829616413917, + "grad_norm": 1.4120056629180908, + "learning_rate": 3.3293697978596914e-07, + "loss": 0.2484, + "step": 4071 + }, + { + "epoch": 9.673208444840915, + "grad_norm": 1.630441665649414, + "learning_rate": 3.305588585017836e-07, + "loss": 0.309, + "step": 4072 + }, + { + "epoch": 9.675587273267915, + "grad_norm": 1.6048080921173096, + "learning_rate": 3.281807372175981e-07, + "loss": 0.3062, + "step": 4073 + }, + { + "epoch": 9.677966101694915, + "grad_norm": 1.538051962852478, + "learning_rate": 3.258026159334126e-07, + "loss": 0.3816, + "step": 4074 + }, + { + "epoch": 9.680344930121915, + "grad_norm": 1.7660027742385864, + "learning_rate": 3.2342449464922716e-07, + "loss": 0.335, + "step": 4075 + }, + { + "epoch": 9.682723758548915, + "grad_norm": 1.762683391571045, + "learning_rate": 3.2104637336504166e-07, + "loss": 0.381, + "step": 4076 + }, + { + "epoch": 9.685102586975914, + "grad_norm": 1.4295339584350586, + "learning_rate": 3.1866825208085617e-07, + "loss": 0.3534, + "step": 4077 + }, + { + "epoch": 9.687481415402914, + "grad_norm": 1.513542652130127, + "learning_rate": 3.162901307966706e-07, + "loss": 0.3323, + "step": 4078 + }, + { + "epoch": 9.689860243829914, + "grad_norm": 1.4512635469436646, + "learning_rate": 3.1391200951248513e-07, + "loss": 0.2558, + "step": 4079 + }, + { + "epoch": 9.692239072256914, + "grad_norm": 1.6210118532180786, + "learning_rate": 3.115338882282997e-07, + "loss": 0.3171, + "step": 4080 + }, + { + "epoch": 9.694617900683912, + "grad_norm": 1.8660438060760498, + "learning_rate": 3.091557669441142e-07, + "loss": 0.3682, + "step": 4081 + }, + { + "epoch": 9.696996729110912, + "grad_norm": 1.6890968084335327, + "learning_rate": 3.067776456599287e-07, + "loss": 0.3675, + "step": 4082 + }, + { + "epoch": 9.699375557537913, + "grad_norm": 1.5015740394592285, + "learning_rate": 3.043995243757432e-07, + "loss": 0.3158, + "step": 4083 + }, + { + "epoch": 9.701754385964913, + "grad_norm": 1.6050732135772705, + "learning_rate": 3.020214030915577e-07, + "loss": 0.2677, + "step": 4084 + }, + { + "epoch": 9.704133214391913, + "grad_norm": 1.4498337507247925, + "learning_rate": 2.996432818073722e-07, + "loss": 0.2623, + "step": 4085 + }, + { + "epoch": 9.706512042818911, + "grad_norm": 1.8261195421218872, + "learning_rate": 2.972651605231867e-07, + "loss": 0.324, + "step": 4086 + }, + { + "epoch": 9.708890871245911, + "grad_norm": 1.6155118942260742, + "learning_rate": 2.948870392390012e-07, + "loss": 0.3304, + "step": 4087 + }, + { + "epoch": 9.711269699672911, + "grad_norm": 1.5439618825912476, + "learning_rate": 2.925089179548157e-07, + "loss": 0.3575, + "step": 4088 + }, + { + "epoch": 9.713648528099911, + "grad_norm": 1.5752452611923218, + "learning_rate": 2.9013079667063023e-07, + "loss": 0.3096, + "step": 4089 + }, + { + "epoch": 9.71602735652691, + "grad_norm": 1.6594899892807007, + "learning_rate": 2.8775267538644473e-07, + "loss": 0.3535, + "step": 4090 + }, + { + "epoch": 9.71840618495391, + "grad_norm": 1.5438857078552246, + "learning_rate": 2.8537455410225924e-07, + "loss": 0.2638, + "step": 4091 + }, + { + "epoch": 9.72078501338091, + "grad_norm": 1.8579200506210327, + "learning_rate": 2.8299643281807374e-07, + "loss": 0.3495, + "step": 4092 + }, + { + "epoch": 9.72316384180791, + "grad_norm": 1.5831332206726074, + "learning_rate": 2.8061831153388825e-07, + "loss": 0.3666, + "step": 4093 + }, + { + "epoch": 9.725542670234908, + "grad_norm": 1.485753059387207, + "learning_rate": 2.7824019024970275e-07, + "loss": 0.2658, + "step": 4094 + }, + { + "epoch": 9.727921498661908, + "grad_norm": 1.4695202112197876, + "learning_rate": 2.7586206896551726e-07, + "loss": 0.301, + "step": 4095 + }, + { + "epoch": 9.730300327088909, + "grad_norm": 1.7288553714752197, + "learning_rate": 2.7348394768133176e-07, + "loss": 0.3276, + "step": 4096 + }, + { + "epoch": 9.732679155515909, + "grad_norm": 1.3838896751403809, + "learning_rate": 2.7110582639714627e-07, + "loss": 0.2819, + "step": 4097 + }, + { + "epoch": 9.735057983942909, + "grad_norm": 1.5792163610458374, + "learning_rate": 2.687277051129608e-07, + "loss": 0.2795, + "step": 4098 + }, + { + "epoch": 9.737436812369907, + "grad_norm": 1.4303656816482544, + "learning_rate": 2.663495838287753e-07, + "loss": 0.2528, + "step": 4099 + }, + { + "epoch": 9.739815640796907, + "grad_norm": 1.7509078979492188, + "learning_rate": 2.639714625445898e-07, + "loss": 0.3487, + "step": 4100 + }, + { + "epoch": 9.739815640796907, + "eval_loss": 0.4274255335330963, + "eval_runtime": 22.9867, + "eval_samples_per_second": 32.541, + "eval_steps_per_second": 16.27, + "step": 4100 + }, + { + "epoch": 9.742194469223907, + "grad_norm": 1.6287565231323242, + "learning_rate": 2.615933412604043e-07, + "loss": 0.3073, + "step": 4101 + }, + { + "epoch": 9.744573297650907, + "grad_norm": 1.5244901180267334, + "learning_rate": 2.5921521997621885e-07, + "loss": 0.3587, + "step": 4102 + }, + { + "epoch": 9.746952126077908, + "grad_norm": 1.5579795837402344, + "learning_rate": 2.568370986920333e-07, + "loss": 0.2852, + "step": 4103 + }, + { + "epoch": 9.749330954504906, + "grad_norm": 1.6678608655929565, + "learning_rate": 2.544589774078478e-07, + "loss": 0.318, + "step": 4104 + }, + { + "epoch": 9.751709782931906, + "grad_norm": 1.351474642753601, + "learning_rate": 2.5208085612366236e-07, + "loss": 0.2558, + "step": 4105 + }, + { + "epoch": 9.754088611358906, + "grad_norm": 1.3038499355316162, + "learning_rate": 2.497027348394768e-07, + "loss": 0.2866, + "step": 4106 + }, + { + "epoch": 9.756467439785906, + "grad_norm": 1.3546950817108154, + "learning_rate": 2.473246135552913e-07, + "loss": 0.3817, + "step": 4107 + }, + { + "epoch": 9.758846268212904, + "grad_norm": 1.4943026304244995, + "learning_rate": 2.449464922711059e-07, + "loss": 0.3282, + "step": 4108 + }, + { + "epoch": 9.761225096639905, + "grad_norm": 1.5175234079360962, + "learning_rate": 2.4256837098692033e-07, + "loss": 0.2443, + "step": 4109 + }, + { + "epoch": 9.763603925066905, + "grad_norm": 1.699013590812683, + "learning_rate": 2.4019024970273484e-07, + "loss": 0.3062, + "step": 4110 + }, + { + "epoch": 9.765982753493905, + "grad_norm": 1.7625179290771484, + "learning_rate": 2.3781212841854937e-07, + "loss": 0.3249, + "step": 4111 + }, + { + "epoch": 9.768361581920903, + "grad_norm": 1.675641655921936, + "learning_rate": 2.3543400713436387e-07, + "loss": 0.3711, + "step": 4112 + }, + { + "epoch": 9.770740410347903, + "grad_norm": 1.6583119630813599, + "learning_rate": 2.3305588585017838e-07, + "loss": 0.3278, + "step": 4113 + }, + { + "epoch": 9.773119238774903, + "grad_norm": 1.474919319152832, + "learning_rate": 2.306777645659929e-07, + "loss": 0.267, + "step": 4114 + }, + { + "epoch": 9.775498067201903, + "grad_norm": 1.7411062717437744, + "learning_rate": 2.2829964328180739e-07, + "loss": 0.2806, + "step": 4115 + }, + { + "epoch": 9.777876895628903, + "grad_norm": 1.5017890930175781, + "learning_rate": 2.259215219976219e-07, + "loss": 0.3137, + "step": 4116 + }, + { + "epoch": 9.780255724055902, + "grad_norm": 1.4970654249191284, + "learning_rate": 2.2354340071343642e-07, + "loss": 0.3381, + "step": 4117 + }, + { + "epoch": 9.782634552482902, + "grad_norm": 1.9278727769851685, + "learning_rate": 2.211652794292509e-07, + "loss": 0.3543, + "step": 4118 + }, + { + "epoch": 9.785013380909902, + "grad_norm": 1.350045919418335, + "learning_rate": 2.187871581450654e-07, + "loss": 0.2827, + "step": 4119 + }, + { + "epoch": 9.787392209336902, + "grad_norm": 1.6073265075683594, + "learning_rate": 2.1640903686087994e-07, + "loss": 0.3123, + "step": 4120 + }, + { + "epoch": 9.7897710377639, + "grad_norm": 1.5686168670654297, + "learning_rate": 2.1403091557669442e-07, + "loss": 0.2273, + "step": 4121 + }, + { + "epoch": 9.7921498661909, + "grad_norm": 1.5529402494430542, + "learning_rate": 2.1165279429250892e-07, + "loss": 0.2845, + "step": 4122 + }, + { + "epoch": 9.7945286946179, + "grad_norm": 1.6613699197769165, + "learning_rate": 2.0927467300832345e-07, + "loss": 0.3466, + "step": 4123 + }, + { + "epoch": 9.7969075230449, + "grad_norm": 1.703978180885315, + "learning_rate": 2.0689655172413796e-07, + "loss": 0.2859, + "step": 4124 + }, + { + "epoch": 9.7992863514719, + "grad_norm": 1.3371517658233643, + "learning_rate": 2.0451843043995244e-07, + "loss": 0.2697, + "step": 4125 + }, + { + "epoch": 9.8016651798989, + "grad_norm": 1.507546067237854, + "learning_rate": 2.0214030915576697e-07, + "loss": 0.3067, + "step": 4126 + }, + { + "epoch": 9.8040440083259, + "grad_norm": 1.5115079879760742, + "learning_rate": 1.9976218787158147e-07, + "loss": 0.2742, + "step": 4127 + }, + { + "epoch": 9.8064228367529, + "grad_norm": 1.3799859285354614, + "learning_rate": 1.9738406658739595e-07, + "loss": 0.2738, + "step": 4128 + }, + { + "epoch": 9.8088016651799, + "grad_norm": 1.825982928276062, + "learning_rate": 1.9500594530321048e-07, + "loss": 0.3816, + "step": 4129 + }, + { + "epoch": 9.811180493606898, + "grad_norm": 1.6645939350128174, + "learning_rate": 1.92627824019025e-07, + "loss": 0.2643, + "step": 4130 + }, + { + "epoch": 9.813559322033898, + "grad_norm": 1.4886921644210815, + "learning_rate": 1.9024970273483947e-07, + "loss": 0.2783, + "step": 4131 + }, + { + "epoch": 9.815938150460898, + "grad_norm": 1.7378298044204712, + "learning_rate": 1.87871581450654e-07, + "loss": 0.3083, + "step": 4132 + }, + { + "epoch": 9.818316978887898, + "grad_norm": 1.4903514385223389, + "learning_rate": 1.854934601664685e-07, + "loss": 0.2777, + "step": 4133 + }, + { + "epoch": 9.820695807314898, + "grad_norm": 1.6404743194580078, + "learning_rate": 1.8311533888228303e-07, + "loss": 0.2931, + "step": 4134 + }, + { + "epoch": 9.823074635741897, + "grad_norm": 1.592679500579834, + "learning_rate": 1.8073721759809751e-07, + "loss": 0.2983, + "step": 4135 + }, + { + "epoch": 9.825453464168897, + "grad_norm": 1.5259467363357544, + "learning_rate": 1.7835909631391202e-07, + "loss": 0.321, + "step": 4136 + }, + { + "epoch": 9.827832292595897, + "grad_norm": 1.5546045303344727, + "learning_rate": 1.7598097502972655e-07, + "loss": 0.3181, + "step": 4137 + }, + { + "epoch": 9.830211121022897, + "grad_norm": 1.6925842761993408, + "learning_rate": 1.7360285374554103e-07, + "loss": 0.3212, + "step": 4138 + }, + { + "epoch": 9.832589949449895, + "grad_norm": 1.4672044515609741, + "learning_rate": 1.7122473246135553e-07, + "loss": 0.2932, + "step": 4139 + }, + { + "epoch": 9.834968777876895, + "grad_norm": 1.6418482065200806, + "learning_rate": 1.6884661117717006e-07, + "loss": 0.3411, + "step": 4140 + }, + { + "epoch": 9.837347606303895, + "grad_norm": 1.738821268081665, + "learning_rate": 1.6646848989298457e-07, + "loss": 0.3487, + "step": 4141 + }, + { + "epoch": 9.839726434730895, + "grad_norm": 1.6343719959259033, + "learning_rate": 1.6409036860879905e-07, + "loss": 0.3307, + "step": 4142 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 1.5304145812988281, + "learning_rate": 1.6171224732461358e-07, + "loss": 0.357, + "step": 4143 + }, + { + "epoch": 9.844484091584894, + "grad_norm": 1.8024169206619263, + "learning_rate": 1.5933412604042808e-07, + "loss": 0.3011, + "step": 4144 + }, + { + "epoch": 9.846862920011894, + "grad_norm": 1.6034330129623413, + "learning_rate": 1.5695600475624256e-07, + "loss": 0.3363, + "step": 4145 + }, + { + "epoch": 9.849241748438894, + "grad_norm": 1.7150962352752686, + "learning_rate": 1.545778834720571e-07, + "loss": 0.4267, + "step": 4146 + }, + { + "epoch": 9.851620576865894, + "grad_norm": 1.4316542148590088, + "learning_rate": 1.521997621878716e-07, + "loss": 0.2777, + "step": 4147 + }, + { + "epoch": 9.853999405292893, + "grad_norm": 2.0502235889434814, + "learning_rate": 1.498216409036861e-07, + "loss": 0.3571, + "step": 4148 + }, + { + "epoch": 9.856378233719893, + "grad_norm": 1.79032564163208, + "learning_rate": 1.474435196195006e-07, + "loss": 0.3477, + "step": 4149 + }, + { + "epoch": 9.858757062146893, + "grad_norm": 1.537423014640808, + "learning_rate": 1.4506539833531511e-07, + "loss": 0.357, + "step": 4150 + }, + { + "epoch": 9.858757062146893, + "eval_loss": 0.42732474207878113, + "eval_runtime": 22.6232, + "eval_samples_per_second": 33.063, + "eval_steps_per_second": 16.532, + "step": 4150 + }, + { + "epoch": 9.861135890573893, + "grad_norm": 1.5236681699752808, + "learning_rate": 1.4268727705112962e-07, + "loss": 0.3661, + "step": 4151 + }, + { + "epoch": 9.863514719000893, + "grad_norm": 1.7450720071792603, + "learning_rate": 1.4030915576694412e-07, + "loss": 0.3781, + "step": 4152 + }, + { + "epoch": 9.865893547427891, + "grad_norm": 1.511458158493042, + "learning_rate": 1.3793103448275863e-07, + "loss": 0.3016, + "step": 4153 + }, + { + "epoch": 9.868272375854891, + "grad_norm": 1.6669831275939941, + "learning_rate": 1.3555291319857313e-07, + "loss": 0.3711, + "step": 4154 + }, + { + "epoch": 9.870651204281891, + "grad_norm": 1.5299371480941772, + "learning_rate": 1.3317479191438764e-07, + "loss": 0.278, + "step": 4155 + }, + { + "epoch": 9.873030032708892, + "grad_norm": 1.6824415922164917, + "learning_rate": 1.3079667063020214e-07, + "loss": 0.338, + "step": 4156 + }, + { + "epoch": 9.87540886113589, + "grad_norm": 1.7418583631515503, + "learning_rate": 1.2841854934601665e-07, + "loss": 0.3889, + "step": 4157 + }, + { + "epoch": 9.87778768956289, + "grad_norm": 1.472227931022644, + "learning_rate": 1.2604042806183118e-07, + "loss": 0.2714, + "step": 4158 + }, + { + "epoch": 9.88016651798989, + "grad_norm": 1.7458977699279785, + "learning_rate": 1.2366230677764566e-07, + "loss": 0.3638, + "step": 4159 + }, + { + "epoch": 9.88254534641689, + "grad_norm": 1.5682693719863892, + "learning_rate": 1.2128418549346017e-07, + "loss": 0.3201, + "step": 4160 + }, + { + "epoch": 9.884924174843889, + "grad_norm": 1.572015643119812, + "learning_rate": 1.1890606420927468e-07, + "loss": 0.337, + "step": 4161 + }, + { + "epoch": 9.887303003270889, + "grad_norm": 1.7078760862350464, + "learning_rate": 1.1652794292508919e-07, + "loss": 0.3495, + "step": 4162 + }, + { + "epoch": 9.889681831697889, + "grad_norm": 1.5125267505645752, + "learning_rate": 1.1414982164090369e-07, + "loss": 0.2722, + "step": 4163 + }, + { + "epoch": 9.892060660124889, + "grad_norm": 1.614736557006836, + "learning_rate": 1.1177170035671821e-07, + "loss": 0.3697, + "step": 4164 + }, + { + "epoch": 9.894439488551889, + "grad_norm": 1.5840911865234375, + "learning_rate": 1.093935790725327e-07, + "loss": 0.2588, + "step": 4165 + }, + { + "epoch": 9.896818316978887, + "grad_norm": 1.5776455402374268, + "learning_rate": 1.0701545778834721e-07, + "loss": 0.261, + "step": 4166 + }, + { + "epoch": 9.899197145405887, + "grad_norm": 1.7534818649291992, + "learning_rate": 1.0463733650416173e-07, + "loss": 0.3049, + "step": 4167 + }, + { + "epoch": 9.901575973832887, + "grad_norm": 1.5528936386108398, + "learning_rate": 1.0225921521997622e-07, + "loss": 0.3126, + "step": 4168 + }, + { + "epoch": 9.903954802259888, + "grad_norm": 1.3336870670318604, + "learning_rate": 9.988109393579074e-08, + "loss": 0.2538, + "step": 4169 + }, + { + "epoch": 9.906333630686886, + "grad_norm": 1.5471694469451904, + "learning_rate": 9.750297265160524e-08, + "loss": 0.3154, + "step": 4170 + }, + { + "epoch": 9.908712459113886, + "grad_norm": 1.5014779567718506, + "learning_rate": 9.512485136741973e-08, + "loss": 0.3085, + "step": 4171 + }, + { + "epoch": 9.911091287540886, + "grad_norm": 1.5338668823242188, + "learning_rate": 9.274673008323425e-08, + "loss": 0.2889, + "step": 4172 + }, + { + "epoch": 9.913470115967886, + "grad_norm": 1.5645641088485718, + "learning_rate": 9.036860879904876e-08, + "loss": 0.3083, + "step": 4173 + }, + { + "epoch": 9.915848944394886, + "grad_norm": 1.6276915073394775, + "learning_rate": 8.799048751486327e-08, + "loss": 0.2801, + "step": 4174 + }, + { + "epoch": 9.918227772821885, + "grad_norm": 1.6552796363830566, + "learning_rate": 8.561236623067777e-08, + "loss": 0.3755, + "step": 4175 + }, + { + "epoch": 9.920606601248885, + "grad_norm": 1.8738993406295776, + "learning_rate": 8.323424494649228e-08, + "loss": 0.3255, + "step": 4176 + }, + { + "epoch": 9.922985429675885, + "grad_norm": 1.7143714427947998, + "learning_rate": 8.085612366230679e-08, + "loss": 0.2985, + "step": 4177 + }, + { + "epoch": 9.925364258102885, + "grad_norm": 1.6551828384399414, + "learning_rate": 7.847800237812128e-08, + "loss": 0.3272, + "step": 4178 + }, + { + "epoch": 9.927743086529883, + "grad_norm": 1.813767671585083, + "learning_rate": 7.60998810939358e-08, + "loss": 0.3152, + "step": 4179 + }, + { + "epoch": 9.930121914956883, + "grad_norm": 1.611672282218933, + "learning_rate": 7.37217598097503e-08, + "loss": 0.3257, + "step": 4180 + }, + { + "epoch": 9.932500743383883, + "grad_norm": 1.6380794048309326, + "learning_rate": 7.134363852556481e-08, + "loss": 0.2522, + "step": 4181 + }, + { + "epoch": 9.934879571810884, + "grad_norm": 1.5607755184173584, + "learning_rate": 6.896551724137931e-08, + "loss": 0.3203, + "step": 4182 + }, + { + "epoch": 9.937258400237884, + "grad_norm": 1.6128185987472534, + "learning_rate": 6.658739595719382e-08, + "loss": 0.3836, + "step": 4183 + }, + { + "epoch": 9.939637228664882, + "grad_norm": 1.847948670387268, + "learning_rate": 6.420927467300833e-08, + "loss": 0.3655, + "step": 4184 + }, + { + "epoch": 9.942016057091882, + "grad_norm": 1.6713145971298218, + "learning_rate": 6.183115338882283e-08, + "loss": 0.3366, + "step": 4185 + }, + { + "epoch": 9.944394885518882, + "grad_norm": 1.5752474069595337, + "learning_rate": 5.945303210463734e-08, + "loss": 0.2994, + "step": 4186 + }, + { + "epoch": 9.946773713945882, + "grad_norm": 1.7897475957870483, + "learning_rate": 5.7074910820451847e-08, + "loss": 0.3256, + "step": 4187 + }, + { + "epoch": 9.94915254237288, + "grad_norm": 1.6531521081924438, + "learning_rate": 5.469678953626635e-08, + "loss": 0.354, + "step": 4188 + }, + { + "epoch": 9.95153137079988, + "grad_norm": 1.5660436153411865, + "learning_rate": 5.231866825208086e-08, + "loss": 0.3059, + "step": 4189 + }, + { + "epoch": 9.95391019922688, + "grad_norm": 1.6248193979263306, + "learning_rate": 4.994054696789537e-08, + "loss": 0.3255, + "step": 4190 + }, + { + "epoch": 9.956289027653881, + "grad_norm": 1.6425484418869019, + "learning_rate": 4.756242568370987e-08, + "loss": 0.3244, + "step": 4191 + }, + { + "epoch": 9.95866785608088, + "grad_norm": 1.396389365196228, + "learning_rate": 4.518430439952438e-08, + "loss": 0.2881, + "step": 4192 + }, + { + "epoch": 9.96104668450788, + "grad_norm": 1.528059959411621, + "learning_rate": 4.2806183115338883e-08, + "loss": 0.2998, + "step": 4193 + }, + { + "epoch": 9.96342551293488, + "grad_norm": 1.8317033052444458, + "learning_rate": 4.0428061831153395e-08, + "loss": 0.3514, + "step": 4194 + }, + { + "epoch": 9.96580434136188, + "grad_norm": 1.4546871185302734, + "learning_rate": 3.80499405469679e-08, + "loss": 0.3384, + "step": 4195 + }, + { + "epoch": 9.96818316978888, + "grad_norm": 1.50650954246521, + "learning_rate": 3.5671819262782405e-08, + "loss": 0.3036, + "step": 4196 + }, + { + "epoch": 9.970561998215878, + "grad_norm": 1.5276455879211426, + "learning_rate": 3.329369797859691e-08, + "loss": 0.3258, + "step": 4197 + }, + { + "epoch": 9.972940826642878, + "grad_norm": 1.5051987171173096, + "learning_rate": 3.0915576694411415e-08, + "loss": 0.347, + "step": 4198 + }, + { + "epoch": 9.975319655069878, + "grad_norm": 1.4506568908691406, + "learning_rate": 2.8537455410225923e-08, + "loss": 0.2562, + "step": 4199 + }, + { + "epoch": 9.977698483496878, + "grad_norm": 1.757292628288269, + "learning_rate": 2.615933412604043e-08, + "loss": 0.318, + "step": 4200 + }, + { + "epoch": 9.977698483496878, + "eval_loss": 0.42730140686035156, + "eval_runtime": 22.5362, + "eval_samples_per_second": 33.191, + "eval_steps_per_second": 16.596, + "step": 4200 + }, + { + "epoch": 9.980077311923878, + "grad_norm": 1.5834832191467285, + "learning_rate": 2.3781212841854933e-08, + "loss": 0.3622, + "step": 4201 + }, + { + "epoch": 9.982456140350877, + "grad_norm": 1.608739972114563, + "learning_rate": 2.1403091557669442e-08, + "loss": 0.322, + "step": 4202 + }, + { + "epoch": 9.984834968777877, + "grad_norm": 1.5913673639297485, + "learning_rate": 1.902497027348395e-08, + "loss": 0.3077, + "step": 4203 + }, + { + "epoch": 9.987213797204877, + "grad_norm": 1.6187902688980103, + "learning_rate": 1.6646848989298455e-08, + "loss": 0.3828, + "step": 4204 + }, + { + "epoch": 9.989592625631877, + "grad_norm": 1.4376949071884155, + "learning_rate": 1.4268727705112962e-08, + "loss": 0.3034, + "step": 4205 + }, + { + "epoch": 9.991971454058875, + "grad_norm": 1.6035895347595215, + "learning_rate": 1.1890606420927467e-08, + "loss": 0.3186, + "step": 4206 + }, + { + "epoch": 9.994350282485875, + "grad_norm": 1.4125486612319946, + "learning_rate": 9.512485136741975e-09, + "loss": 0.331, + "step": 4207 + }, + { + "epoch": 9.996729110912876, + "grad_norm": 1.887389063835144, + "learning_rate": 7.134363852556481e-09, + "loss": 0.3261, + "step": 4208 + }, + { + "epoch": 9.999107939339876, + "grad_norm": 1.7470370531082153, + "learning_rate": 4.7562425683709875e-09, + "loss": 0.3334, + "step": 4209 + }, + { + "epoch": 10.0, + "grad_norm": 2.620931625366211, + "learning_rate": 2.3781212841854937e-09, + "loss": 0.2638, + "step": 4210 + } + ], + "logging_steps": 1, + "max_steps": 4210, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4633739894558925e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}