{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.759526938239159, "eval_steps": 10240, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001314060446780552, "grad_norm": 6.0887322425842285, "learning_rate": 1.4492753623188408e-07, "loss": 1.0770764350891113, "memory(GiB)": 39.15, "step": 1, "token_acc": 0.7469458987783595, "train_speed(iter/s)": 0.013712 }, { "epoch": 0.006570302233902759, "grad_norm": 6.300453186035156, "learning_rate": 7.246376811594204e-07, "loss": 1.0886579751968384, "memory(GiB)": 84.21, "step": 5, "token_acc": 0.7434342087721637, "train_speed(iter/s)": 0.026613 }, { "epoch": 0.013140604467805518, "grad_norm": 5.09555196762085, "learning_rate": 1.4492753623188408e-06, "loss": 1.0676928520202638, "memory(GiB)": 84.21, "step": 10, "token_acc": 0.740495867768595, "train_speed(iter/s)": 0.030047 }, { "epoch": 0.01971090670170828, "grad_norm": 3.2556324005126953, "learning_rate": 2.173913043478261e-06, "loss": 0.9635882377624512, "memory(GiB)": 84.21, "step": 15, "token_acc": 0.7701234008830522, "train_speed(iter/s)": 0.031287 }, { "epoch": 0.026281208935611037, "grad_norm": 1.6578456163406372, "learning_rate": 2.8985507246376816e-06, "loss": 0.8276536941528321, "memory(GiB)": 84.21, "step": 20, "token_acc": 0.7827149763702537, "train_speed(iter/s)": 0.031906 }, { "epoch": 0.0328515111695138, "grad_norm": 1.177905797958374, "learning_rate": 3.6231884057971017e-06, "loss": 0.7361048221588135, "memory(GiB)": 84.21, "step": 25, "token_acc": 0.7906462683962538, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.03942181340341656, "grad_norm": 0.6714381575584412, "learning_rate": 4.347826086956522e-06, "loss": 0.6780746459960938, "memory(GiB)": 84.21, "step": 30, "token_acc": 0.8038660725039143, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.045992115637319315, "grad_norm": 0.6768696904182434, "learning_rate": 5.072463768115943e-06, "loss": 0.6487759590148926, "memory(GiB)": 84.21, "step": 35, "token_acc": 0.833314147576839, "train_speed(iter/s)": 0.03272 }, { "epoch": 0.052562417871222074, "grad_norm": 0.5112195611000061, "learning_rate": 5.797101449275363e-06, "loss": 0.6321969032287598, "memory(GiB)": 84.21, "step": 40, "token_acc": 0.8408800826596973, "train_speed(iter/s)": 0.032903 }, { "epoch": 0.05913272010512484, "grad_norm": 0.4675757586956024, "learning_rate": 6.521739130434783e-06, "loss": 0.6117629528045654, "memory(GiB)": 84.21, "step": 45, "token_acc": 0.8229637648856907, "train_speed(iter/s)": 0.033031 }, { "epoch": 0.0657030223390276, "grad_norm": 0.4107670783996582, "learning_rate": 7.246376811594203e-06, "loss": 0.5980951309204101, "memory(GiB)": 84.21, "step": 50, "token_acc": 0.8370074882776961, "train_speed(iter/s)": 0.033154 }, { "epoch": 0.07227332457293036, "grad_norm": 0.37559813261032104, "learning_rate": 7.971014492753623e-06, "loss": 0.5822395801544189, "memory(GiB)": 84.21, "step": 55, "token_acc": 0.8410229088971763, "train_speed(iter/s)": 0.033226 }, { "epoch": 0.07884362680683311, "grad_norm": 0.4154057502746582, "learning_rate": 8.695652173913044e-06, "loss": 0.5758543968200683, "memory(GiB)": 84.21, "step": 60, "token_acc": 0.8529284789178299, "train_speed(iter/s)": 0.033321 }, { "epoch": 0.08541392904073587, "grad_norm": 0.42753836512565613, "learning_rate": 9.420289855072464e-06, "loss": 0.5728845596313477, "memory(GiB)": 84.21, "step": 65, "token_acc": 0.8387224954055531, "train_speed(iter/s)": 0.033349 }, { "epoch": 0.09198423127463863, "grad_norm": 0.42457839846611023, "learning_rate": 9.999994966333388e-06, "loss": 0.564476203918457, "memory(GiB)": 84.21, "step": 70, "token_acc": 0.8388107377603047, "train_speed(iter/s)": 0.033448 }, { "epoch": 0.09855453350854139, "grad_norm": 0.36299943923950195, "learning_rate": 9.999818789066164e-06, "loss": 0.555049991607666, "memory(GiB)": 84.21, "step": 75, "token_acc": 0.8408914844169001, "train_speed(iter/s)": 0.033455 }, { "epoch": 0.10512483574244415, "grad_norm": 0.3913320302963257, "learning_rate": 9.99939093860338e-06, "loss": 0.5565983772277832, "memory(GiB)": 84.21, "step": 80, "token_acc": 0.8320722084099016, "train_speed(iter/s)": 0.03343 }, { "epoch": 0.1116951379763469, "grad_norm": 0.36235758662223816, "learning_rate": 9.998711436481519e-06, "loss": 0.5525528907775878, "memory(GiB)": 84.21, "step": 85, "token_acc": 0.8355197947641537, "train_speed(iter/s)": 0.033468 }, { "epoch": 0.11826544021024968, "grad_norm": 0.38250720500946045, "learning_rate": 9.99778031690431e-06, "loss": 0.5516636848449707, "memory(GiB)": 84.21, "step": 90, "token_acc": 0.8384240551461849, "train_speed(iter/s)": 0.033529 }, { "epoch": 0.12483574244415244, "grad_norm": 0.37428662180900574, "learning_rate": 9.996597626741023e-06, "loss": 0.5437192440032959, "memory(GiB)": 84.21, "step": 95, "token_acc": 0.8406958239587334, "train_speed(iter/s)": 0.033585 }, { "epoch": 0.1314060446780552, "grad_norm": 0.3782438635826111, "learning_rate": 9.995163425524097e-06, "loss": 0.5443241119384765, "memory(GiB)": 84.21, "step": 100, "token_acc": 0.834660268295343, "train_speed(iter/s)": 0.03366 }, { "epoch": 0.13797634691195795, "grad_norm": 0.38486766815185547, "learning_rate": 9.993477785446151e-06, "loss": 0.5410516738891602, "memory(GiB)": 86.38, "step": 105, "token_acc": 0.8449431198379305, "train_speed(iter/s)": 0.033686 }, { "epoch": 0.1445466491458607, "grad_norm": 0.38819748163223267, "learning_rate": 9.991540791356342e-06, "loss": 0.5370469093322754, "memory(GiB)": 86.38, "step": 110, "token_acc": 0.8543880362062181, "train_speed(iter/s)": 0.033712 }, { "epoch": 0.15111695137976347, "grad_norm": 0.39973896741867065, "learning_rate": 9.989352540756103e-06, "loss": 0.5358469486236572, "memory(GiB)": 86.38, "step": 115, "token_acc": 0.8282656701206047, "train_speed(iter/s)": 0.033757 }, { "epoch": 0.15768725361366623, "grad_norm": 0.34199291467666626, "learning_rate": 9.986913143794232e-06, "loss": 0.5350133895874023, "memory(GiB)": 86.38, "step": 120, "token_acc": 0.8469218989280245, "train_speed(iter/s)": 0.033788 }, { "epoch": 0.164257555847569, "grad_norm": 0.41273701190948486, "learning_rate": 9.984222723261344e-06, "loss": 0.5307738304138183, "memory(GiB)": 86.38, "step": 125, "token_acc": 0.8481556913328807, "train_speed(iter/s)": 0.033799 }, { "epoch": 0.17082785808147175, "grad_norm": 0.4566132724285126, "learning_rate": 9.981281414583693e-06, "loss": 0.5298214912414551, "memory(GiB)": 86.38, "step": 130, "token_acc": 0.840121171322787, "train_speed(iter/s)": 0.033812 }, { "epoch": 0.1773981603153745, "grad_norm": 0.3990865647792816, "learning_rate": 9.978089365816357e-06, "loss": 0.5284788131713867, "memory(GiB)": 86.38, "step": 135, "token_acc": 0.844040404040404, "train_speed(iter/s)": 0.033875 }, { "epoch": 0.18396846254927726, "grad_norm": 0.36237913370132446, "learning_rate": 9.974646737635781e-06, "loss": 0.530832576751709, "memory(GiB)": 86.38, "step": 140, "token_acc": 0.8354903823319877, "train_speed(iter/s)": 0.033929 }, { "epoch": 0.19053876478318002, "grad_norm": 0.4100829064846039, "learning_rate": 9.970953703331692e-06, "loss": 0.5266030788421631, "memory(GiB)": 86.38, "step": 145, "token_acc": 0.8457928481723842, "train_speed(iter/s)": 0.033964 }, { "epoch": 0.19710906701708278, "grad_norm": 0.3652012050151825, "learning_rate": 9.967010448798376e-06, "loss": 0.5251831531524658, "memory(GiB)": 86.38, "step": 150, "token_acc": 0.8387645380732939, "train_speed(iter/s)": 0.033991 }, { "epoch": 0.20367936925098554, "grad_norm": 0.39163169264793396, "learning_rate": 9.962817172525323e-06, "loss": 0.5267560958862305, "memory(GiB)": 86.38, "step": 155, "token_acc": 0.8567956034664975, "train_speed(iter/s)": 0.03401 }, { "epoch": 0.2102496714848883, "grad_norm": 0.41479626297950745, "learning_rate": 9.958374085587228e-06, "loss": 0.519415283203125, "memory(GiB)": 86.38, "step": 160, "token_acc": 0.8440078352228884, "train_speed(iter/s)": 0.034022 }, { "epoch": 0.21681997371879105, "grad_norm": 0.3581003248691559, "learning_rate": 9.953681411633376e-06, "loss": 0.5208570480346679, "memory(GiB)": 86.38, "step": 165, "token_acc": 0.8545801997287634, "train_speed(iter/s)": 0.034026 }, { "epoch": 0.2233902759526938, "grad_norm": 0.44018271565437317, "learning_rate": 9.948739386876376e-06, "loss": 0.5224351406097412, "memory(GiB)": 86.38, "step": 170, "token_acc": 0.8500869565217392, "train_speed(iter/s)": 0.034009 }, { "epoch": 0.22996057818659657, "grad_norm": 0.40481236577033997, "learning_rate": 9.943548260080277e-06, "loss": 0.5226601600646973, "memory(GiB)": 86.38, "step": 175, "token_acc": 0.8479028560807881, "train_speed(iter/s)": 0.033953 }, { "epoch": 0.23653088042049936, "grad_norm": 0.3878992199897766, "learning_rate": 9.938108292548044e-06, "loss": 0.5180087566375733, "memory(GiB)": 86.38, "step": 180, "token_acc": 0.8407539640869474, "train_speed(iter/s)": 0.033944 }, { "epoch": 0.24310118265440211, "grad_norm": 0.3512628674507141, "learning_rate": 9.932419758108403e-06, "loss": 0.5186543464660645, "memory(GiB)": 86.38, "step": 185, "token_acc": 0.8444778362133734, "train_speed(iter/s)": 0.03394 }, { "epoch": 0.24967148488830487, "grad_norm": 0.4015056788921356, "learning_rate": 9.92648294310206e-06, "loss": 0.5142830848693848, "memory(GiB)": 86.38, "step": 190, "token_acc": 0.8457510387614549, "train_speed(iter/s)": 0.033952 }, { "epoch": 0.25624178712220763, "grad_norm": 0.4097774624824524, "learning_rate": 9.920298146367287e-06, "loss": 0.5161718368530274, "memory(GiB)": 86.38, "step": 195, "token_acc": 0.8548741619958237, "train_speed(iter/s)": 0.033951 }, { "epoch": 0.2628120893561104, "grad_norm": 0.3440331816673279, "learning_rate": 9.913865679224876e-06, "loss": 0.5165815353393555, "memory(GiB)": 86.38, "step": 200, "token_acc": 0.8569646310273844, "train_speed(iter/s)": 0.033922 }, { "epoch": 0.26938239159001315, "grad_norm": 0.37692517042160034, "learning_rate": 9.907185865462476e-06, "loss": 0.5182360649108887, "memory(GiB)": 86.38, "step": 205, "token_acc": 0.85995085995086, "train_speed(iter/s)": 0.033952 }, { "epoch": 0.2759526938239159, "grad_norm": 0.37486883997917175, "learning_rate": 9.90025904131829e-06, "loss": 0.5185696125030518, "memory(GiB)": 86.38, "step": 210, "token_acc": 0.8403378378378379, "train_speed(iter/s)": 0.033943 }, { "epoch": 0.28252299605781866, "grad_norm": 0.37737980484962463, "learning_rate": 9.893085555464143e-06, "loss": 0.5123628616333008, "memory(GiB)": 86.38, "step": 215, "token_acc": 0.8524216190921853, "train_speed(iter/s)": 0.033965 }, { "epoch": 0.2890932982917214, "grad_norm": 0.4532665014266968, "learning_rate": 9.885665768987947e-06, "loss": 0.5087783813476563, "memory(GiB)": 86.38, "step": 220, "token_acc": 0.8544157346702661, "train_speed(iter/s)": 0.033986 }, { "epoch": 0.2956636005256242, "grad_norm": 0.3860194683074951, "learning_rate": 9.878000055375512e-06, "loss": 0.5123799324035645, "memory(GiB)": 86.38, "step": 225, "token_acc": 0.842546362339515, "train_speed(iter/s)": 0.03397 }, { "epoch": 0.30223390275952694, "grad_norm": 0.3862650692462921, "learning_rate": 9.87008880049175e-06, "loss": 0.50973482131958, "memory(GiB)": 86.38, "step": 230, "token_acc": 0.8520688830423344, "train_speed(iter/s)": 0.033961 }, { "epoch": 0.3088042049934297, "grad_norm": 0.37506306171417236, "learning_rate": 9.861932402561253e-06, "loss": 0.5082354545593262, "memory(GiB)": 86.38, "step": 235, "token_acc": 0.8468783963289458, "train_speed(iter/s)": 0.033958 }, { "epoch": 0.31537450722733246, "grad_norm": 0.3809449076652527, "learning_rate": 9.853531272148248e-06, "loss": 0.5086749076843262, "memory(GiB)": 86.38, "step": 240, "token_acc": 0.8515756420320736, "train_speed(iter/s)": 0.033967 }, { "epoch": 0.3219448094612352, "grad_norm": 0.4132705628871918, "learning_rate": 9.844885832135928e-06, "loss": 0.5116987228393555, "memory(GiB)": 86.38, "step": 245, "token_acc": 0.8335253065925876, "train_speed(iter/s)": 0.033937 }, { "epoch": 0.328515111695138, "grad_norm": 0.4488829970359802, "learning_rate": 9.83599651770517e-06, "loss": 0.5052802085876464, "memory(GiB)": 86.38, "step": 250, "token_acc": 0.8370962333743154, "train_speed(iter/s)": 0.033926 }, { "epoch": 0.33508541392904073, "grad_norm": 0.39081957936286926, "learning_rate": 9.826863776312621e-06, "loss": 0.5067138671875, "memory(GiB)": 86.38, "step": 255, "token_acc": 0.8409980116734013, "train_speed(iter/s)": 0.033929 }, { "epoch": 0.3416557161629435, "grad_norm": 0.35503068566322327, "learning_rate": 9.817488067668186e-06, "loss": 0.503065824508667, "memory(GiB)": 86.38, "step": 260, "token_acc": 0.8459525843656557, "train_speed(iter/s)": 0.033938 }, { "epoch": 0.34822601839684625, "grad_norm": 0.36853545904159546, "learning_rate": 9.807869863711878e-06, "loss": 0.5073853015899659, "memory(GiB)": 86.38, "step": 265, "token_acc": 0.8587078651685394, "train_speed(iter/s)": 0.033943 }, { "epoch": 0.354796320630749, "grad_norm": 0.36008450388908386, "learning_rate": 9.798009648590073e-06, "loss": 0.5045706748962402, "memory(GiB)": 86.38, "step": 270, "token_acc": 0.861764007597341, "train_speed(iter/s)": 0.033959 }, { "epoch": 0.36136662286465177, "grad_norm": 0.3388707637786865, "learning_rate": 9.787907918631125e-06, "loss": 0.5048944473266601, "memory(GiB)": 86.38, "step": 275, "token_acc": 0.8515256760109154, "train_speed(iter/s)": 0.033951 }, { "epoch": 0.3679369250985545, "grad_norm": 0.36713555455207825, "learning_rate": 9.777565182320396e-06, "loss": 0.501971435546875, "memory(GiB)": 86.38, "step": 280, "token_acc": 0.8557236741555861, "train_speed(iter/s)": 0.033953 }, { "epoch": 0.3745072273324573, "grad_norm": 0.3958764970302582, "learning_rate": 9.766981960274653e-06, "loss": 0.5066198825836181, "memory(GiB)": 86.38, "step": 285, "token_acc": 0.8477457935158585, "train_speed(iter/s)": 0.033957 }, { "epoch": 0.38107752956636004, "grad_norm": 0.3786795139312744, "learning_rate": 9.756158785215866e-06, "loss": 0.5043275833129883, "memory(GiB)": 86.38, "step": 290, "token_acc": 0.8627160493827161, "train_speed(iter/s)": 0.033965 }, { "epoch": 0.3876478318002628, "grad_norm": 0.3754529058933258, "learning_rate": 9.745096201944391e-06, "loss": 0.5016345977783203, "memory(GiB)": 86.38, "step": 295, "token_acc": 0.8560241897968678, "train_speed(iter/s)": 0.033947 }, { "epoch": 0.39421813403416556, "grad_norm": 0.32459399104118347, "learning_rate": 9.733794767311545e-06, "loss": 0.5030747890472412, "memory(GiB)": 86.38, "step": 300, "token_acc": 0.8558913059618383, "train_speed(iter/s)": 0.033938 }, { "epoch": 0.4007884362680683, "grad_norm": 0.37864384055137634, "learning_rate": 9.72225505019158e-06, "loss": 0.5041725158691406, "memory(GiB)": 86.38, "step": 305, "token_acc": 0.8588684699566385, "train_speed(iter/s)": 0.033755 }, { "epoch": 0.4073587385019711, "grad_norm": 0.39976298809051514, "learning_rate": 9.710477631453044e-06, "loss": 0.49967308044433595, "memory(GiB)": 86.38, "step": 310, "token_acc": 0.8473580002474941, "train_speed(iter/s)": 0.033765 }, { "epoch": 0.41392904073587383, "grad_norm": 0.4079159200191498, "learning_rate": 9.698463103929542e-06, "loss": 0.5030883312225342, "memory(GiB)": 86.38, "step": 315, "token_acc": 0.8564925878083287, "train_speed(iter/s)": 0.033776 }, { "epoch": 0.4204993429697766, "grad_norm": 0.4643027186393738, "learning_rate": 9.686212072389904e-06, "loss": 0.5033651351928711, "memory(GiB)": 86.38, "step": 320, "token_acc": 0.8536913611894386, "train_speed(iter/s)": 0.033774 }, { "epoch": 0.42706964520367935, "grad_norm": 0.37644535303115845, "learning_rate": 9.673725153507727e-06, "loss": 0.4978950500488281, "memory(GiB)": 86.38, "step": 325, "token_acc": 0.8490523718739487, "train_speed(iter/s)": 0.033788 }, { "epoch": 0.4336399474375821, "grad_norm": 0.3504714369773865, "learning_rate": 9.66100297583035e-06, "loss": 0.503141212463379, "memory(GiB)": 86.38, "step": 330, "token_acc": 0.8508279539713725, "train_speed(iter/s)": 0.033789 }, { "epoch": 0.44021024967148487, "grad_norm": 0.3424312174320221, "learning_rate": 9.6480461797472e-06, "loss": 0.5007185459136962, "memory(GiB)": 86.38, "step": 335, "token_acc": 0.8463611859838275, "train_speed(iter/s)": 0.03379 }, { "epoch": 0.4467805519053876, "grad_norm": 0.3270646631717682, "learning_rate": 9.63485541745757e-06, "loss": 0.4969663143157959, "memory(GiB)": 86.38, "step": 340, "token_acc": 0.8463258785942492, "train_speed(iter/s)": 0.033791 }, { "epoch": 0.4533508541392904, "grad_norm": 0.3828498423099518, "learning_rate": 9.62143135293779e-06, "loss": 0.49769058227539065, "memory(GiB)": 86.38, "step": 345, "token_acc": 0.8501317996645099, "train_speed(iter/s)": 0.033805 }, { "epoch": 0.45992115637319314, "grad_norm": 0.38863444328308105, "learning_rate": 9.607774661907783e-06, "loss": 0.49465193748474123, "memory(GiB)": 86.38, "step": 350, "token_acc": 0.8597788232418891, "train_speed(iter/s)": 0.033787 }, { "epoch": 0.4664914586070959, "grad_norm": 0.34471848607063293, "learning_rate": 9.593886031797081e-06, "loss": 0.4969064712524414, "memory(GiB)": 86.38, "step": 355, "token_acc": 0.8570174985804986, "train_speed(iter/s)": 0.033791 }, { "epoch": 0.4730617608409987, "grad_norm": 0.32791054248809814, "learning_rate": 9.579766161710209e-06, "loss": 0.5029778480529785, "memory(GiB)": 86.38, "step": 360, "token_acc": 0.8601830935679468, "train_speed(iter/s)": 0.033798 }, { "epoch": 0.47963206307490147, "grad_norm": 0.3596540093421936, "learning_rate": 9.565415762391485e-06, "loss": 0.49364757537841797, "memory(GiB)": 86.38, "step": 365, "token_acc": 0.8599964223958023, "train_speed(iter/s)": 0.033808 }, { "epoch": 0.48620236530880423, "grad_norm": 0.3652913570404053, "learning_rate": 9.550835556189264e-06, "loss": 0.4974925994873047, "memory(GiB)": 86.38, "step": 370, "token_acc": 0.8650134518657153, "train_speed(iter/s)": 0.033823 }, { "epoch": 0.492772667542707, "grad_norm": 0.3590964674949646, "learning_rate": 9.536026277019562e-06, "loss": 0.49645166397094725, "memory(GiB)": 86.38, "step": 375, "token_acc": 0.8576561956647734, "train_speed(iter/s)": 0.033837 }, { "epoch": 0.49934296977660975, "grad_norm": 0.3402176797389984, "learning_rate": 9.520988670329114e-06, "loss": 0.4980118751525879, "memory(GiB)": 86.38, "step": 380, "token_acc": 0.8511267926246301, "train_speed(iter/s)": 0.033834 }, { "epoch": 0.5059132720105125, "grad_norm": 0.3765329122543335, "learning_rate": 9.505723493057862e-06, "loss": 0.49571590423583983, "memory(GiB)": 86.38, "step": 385, "token_acc": 0.8535285568175701, "train_speed(iter/s)": 0.033843 }, { "epoch": 0.5124835742444153, "grad_norm": 0.3668725788593292, "learning_rate": 9.490231513600842e-06, "loss": 0.4947934150695801, "memory(GiB)": 86.38, "step": 390, "token_acc": 0.8614418845456899, "train_speed(iter/s)": 0.033846 }, { "epoch": 0.519053876478318, "grad_norm": 0.3342001140117645, "learning_rate": 9.474513511769513e-06, "loss": 0.4992271900177002, "memory(GiB)": 86.38, "step": 395, "token_acc": 0.8471820311423454, "train_speed(iter/s)": 0.033854 }, { "epoch": 0.5256241787122208, "grad_norm": 0.3347104787826538, "learning_rate": 9.458570278752501e-06, "loss": 0.4942744731903076, "memory(GiB)": 86.38, "step": 400, "token_acc": 0.8615504682622268, "train_speed(iter/s)": 0.033853 }, { "epoch": 0.5321944809461235, "grad_norm": 0.3521013855934143, "learning_rate": 9.442402617075765e-06, "loss": 0.4942043304443359, "memory(GiB)": 86.38, "step": 405, "token_acc": 0.8467462686567164, "train_speed(iter/s)": 0.033851 }, { "epoch": 0.5387647831800263, "grad_norm": 0.35290876030921936, "learning_rate": 9.426011340562222e-06, "loss": 0.4902125358581543, "memory(GiB)": 86.38, "step": 410, "token_acc": 0.8508040849865007, "train_speed(iter/s)": 0.033855 }, { "epoch": 0.545335085413929, "grad_norm": 0.3326910436153412, "learning_rate": 9.409397274290756e-06, "loss": 0.4964996337890625, "memory(GiB)": 86.38, "step": 415, "token_acc": 0.8513913558318532, "train_speed(iter/s)": 0.03386 }, { "epoch": 0.5519053876478318, "grad_norm": 0.3406986892223358, "learning_rate": 9.392561254554712e-06, "loss": 0.4953129768371582, "memory(GiB)": 86.38, "step": 420, "token_acc": 0.8444802578565673, "train_speed(iter/s)": 0.03387 }, { "epoch": 0.5584756898817346, "grad_norm": 0.33178892731666565, "learning_rate": 9.375504128819779e-06, "loss": 0.4913620471954346, "memory(GiB)": 86.38, "step": 425, "token_acc": 0.8482620320855615, "train_speed(iter/s)": 0.033876 }, { "epoch": 0.5650459921156373, "grad_norm": 0.33092719316482544, "learning_rate": 9.358226755681342e-06, "loss": 0.4906820297241211, "memory(GiB)": 86.38, "step": 430, "token_acc": 0.8481144343302991, "train_speed(iter/s)": 0.033885 }, { "epoch": 0.5716162943495401, "grad_norm": 0.34297481179237366, "learning_rate": 9.340730004821266e-06, "loss": 0.49637956619262696, "memory(GiB)": 86.38, "step": 435, "token_acc": 0.8484118291347207, "train_speed(iter/s)": 0.03389 }, { "epoch": 0.5781865965834428, "grad_norm": 0.32844671607017517, "learning_rate": 9.323014756964104e-06, "loss": 0.4932809352874756, "memory(GiB)": 86.38, "step": 440, "token_acc": 0.8545686404967842, "train_speed(iter/s)": 0.03389 }, { "epoch": 0.5847568988173456, "grad_norm": 0.3436914086341858, "learning_rate": 9.305081903832784e-06, "loss": 0.49259676933288576, "memory(GiB)": 86.38, "step": 445, "token_acc": 0.8611830312686716, "train_speed(iter/s)": 0.03388 }, { "epoch": 0.5913272010512484, "grad_norm": 0.32494404911994934, "learning_rate": 9.286932348103716e-06, "loss": 0.4914635181427002, "memory(GiB)": 86.38, "step": 450, "token_acc": 0.8426534209261336, "train_speed(iter/s)": 0.033884 }, { "epoch": 0.5978975032851511, "grad_norm": 0.31298619508743286, "learning_rate": 9.268567003361341e-06, "loss": 0.49518795013427735, "memory(GiB)": 86.38, "step": 455, "token_acc": 0.8555702841334794, "train_speed(iter/s)": 0.033881 }, { "epoch": 0.6044678055190539, "grad_norm": 0.3161918818950653, "learning_rate": 9.249986794052168e-06, "loss": 0.4909826278686523, "memory(GiB)": 86.38, "step": 460, "token_acc": 0.8514960996623588, "train_speed(iter/s)": 0.033883 }, { "epoch": 0.6110381077529566, "grad_norm": 0.32942476868629456, "learning_rate": 9.231192655438222e-06, "loss": 0.49195499420166017, "memory(GiB)": 86.38, "step": 465, "token_acc": 0.8575532549189658, "train_speed(iter/s)": 0.033886 }, { "epoch": 0.6176084099868594, "grad_norm": 0.3199692666530609, "learning_rate": 9.21218553354997e-06, "loss": 0.48216657638549804, "memory(GiB)": 86.38, "step": 470, "token_acc": 0.8621787172711987, "train_speed(iter/s)": 0.033885 }, { "epoch": 0.6241787122207622, "grad_norm": 0.33308735489845276, "learning_rate": 9.192966385138714e-06, "loss": 0.49132823944091797, "memory(GiB)": 86.38, "step": 475, "token_acc": 0.8502202643171806, "train_speed(iter/s)": 0.033894 }, { "epoch": 0.6307490144546649, "grad_norm": 0.34672704339027405, "learning_rate": 9.17353617762841e-06, "loss": 0.49529352188110354, "memory(GiB)": 86.38, "step": 480, "token_acc": 0.8439504061564771, "train_speed(iter/s)": 0.033881 }, { "epoch": 0.6373193166885677, "grad_norm": 0.391335666179657, "learning_rate": 9.153895889066988e-06, "loss": 0.4896709442138672, "memory(GiB)": 86.38, "step": 485, "token_acc": 0.8555057299451918, "train_speed(iter/s)": 0.033888 }, { "epoch": 0.6438896189224704, "grad_norm": 0.32497450709342957, "learning_rate": 9.134046508077116e-06, "loss": 0.48676557540893556, "memory(GiB)": 86.38, "step": 490, "token_acc": 0.8605180168536422, "train_speed(iter/s)": 0.033894 }, { "epoch": 0.6504599211563732, "grad_norm": 0.3421924114227295, "learning_rate": 9.113989033806434e-06, "loss": 0.49125194549560547, "memory(GiB)": 86.38, "step": 495, "token_acc": 0.8528348991524867, "train_speed(iter/s)": 0.033897 }, { "epoch": 0.657030223390276, "grad_norm": 0.3321194350719452, "learning_rate": 9.093724475877262e-06, "loss": 0.4898836135864258, "memory(GiB)": 86.38, "step": 500, "token_acc": 0.8522178943084704, "train_speed(iter/s)": 0.033898 }, { "epoch": 0.6636005256241787, "grad_norm": 0.32021504640579224, "learning_rate": 9.073253854335777e-06, "loss": 0.48738608360290525, "memory(GiB)": 86.38, "step": 505, "token_acc": 0.8417130814391088, "train_speed(iter/s)": 0.033901 }, { "epoch": 0.6701708278580815, "grad_norm": 0.32002168893814087, "learning_rate": 9.052578199600675e-06, "loss": 0.49272966384887695, "memory(GiB)": 86.38, "step": 510, "token_acc": 0.8602219376867264, "train_speed(iter/s)": 0.033901 }, { "epoch": 0.6767411300919842, "grad_norm": 0.31045857071876526, "learning_rate": 9.03169855241129e-06, "loss": 0.4898507118225098, "memory(GiB)": 86.38, "step": 515, "token_acc": 0.8575417434522812, "train_speed(iter/s)": 0.033907 }, { "epoch": 0.683311432325887, "grad_norm": 0.3088115453720093, "learning_rate": 9.01061596377522e-06, "loss": 0.4901163578033447, "memory(GiB)": 86.38, "step": 520, "token_acc": 0.8511583445793972, "train_speed(iter/s)": 0.033899 }, { "epoch": 0.6898817345597897, "grad_norm": 0.34883564710617065, "learning_rate": 8.989331494915417e-06, "loss": 0.49116034507751466, "memory(GiB)": 86.38, "step": 525, "token_acc": 0.8551282847735603, "train_speed(iter/s)": 0.033901 }, { "epoch": 0.6964520367936925, "grad_norm": 0.32082292437553406, "learning_rate": 8.967846217216771e-06, "loss": 0.48834967613220215, "memory(GiB)": 86.38, "step": 530, "token_acc": 0.8506810071870131, "train_speed(iter/s)": 0.033906 }, { "epoch": 0.7030223390275953, "grad_norm": 0.3607739806175232, "learning_rate": 8.946161212172172e-06, "loss": 0.48694772720336915, "memory(GiB)": 86.38, "step": 535, "token_acc": 0.8500481340959284, "train_speed(iter/s)": 0.033908 }, { "epoch": 0.709592641261498, "grad_norm": 0.3413682281970978, "learning_rate": 8.924277571328091e-06, "loss": 0.48662757873535156, "memory(GiB)": 86.38, "step": 540, "token_acc": 0.8603295945861269, "train_speed(iter/s)": 0.033909 }, { "epoch": 0.7161629434954008, "grad_norm": 0.3510483503341675, "learning_rate": 8.902196396229605e-06, "loss": 0.48763227462768555, "memory(GiB)": 86.38, "step": 545, "token_acc": 0.8508162458340395, "train_speed(iter/s)": 0.03392 }, { "epoch": 0.7227332457293035, "grad_norm": 0.31174516677856445, "learning_rate": 8.879918798364984e-06, "loss": 0.48741979598999025, "memory(GiB)": 86.38, "step": 550, "token_acc": 0.8652033455768465, "train_speed(iter/s)": 0.033921 }, { "epoch": 0.7293035479632063, "grad_norm": 0.37009692192077637, "learning_rate": 8.857445899109716e-06, "loss": 0.48439769744873046, "memory(GiB)": 86.38, "step": 555, "token_acc": 0.8583586264357556, "train_speed(iter/s)": 0.033917 }, { "epoch": 0.735873850197109, "grad_norm": 0.32648202776908875, "learning_rate": 8.83477882967007e-06, "loss": 0.4858428955078125, "memory(GiB)": 86.38, "step": 560, "token_acc": 0.8660503897045496, "train_speed(iter/s)": 0.03392 }, { "epoch": 0.7424441524310118, "grad_norm": 0.3123824894428253, "learning_rate": 8.81191873102616e-06, "loss": 0.4876396179199219, "memory(GiB)": 86.38, "step": 565, "token_acc": 0.8565744150136596, "train_speed(iter/s)": 0.033919 }, { "epoch": 0.7490144546649146, "grad_norm": 0.3010823428630829, "learning_rate": 8.788866753874504e-06, "loss": 0.48569602966308595, "memory(GiB)": 86.38, "step": 570, "token_acc": 0.846796506265936, "train_speed(iter/s)": 0.033922 }, { "epoch": 0.7555847568988173, "grad_norm": 0.32120397686958313, "learning_rate": 8.765624058570106e-06, "loss": 0.4865298271179199, "memory(GiB)": 86.38, "step": 575, "token_acc": 0.8490352484639431, "train_speed(iter/s)": 0.033924 }, { "epoch": 0.7621550591327201, "grad_norm": 0.33722633123397827, "learning_rate": 8.742191815068048e-06, "loss": 0.4867109298706055, "memory(GiB)": 86.38, "step": 580, "token_acc": 0.8612191958495461, "train_speed(iter/s)": 0.033917 }, { "epoch": 0.7687253613666228, "grad_norm": 0.32410791516304016, "learning_rate": 8.718571202864598e-06, "loss": 0.4851318359375, "memory(GiB)": 86.38, "step": 585, "token_acc": 0.8603109706993743, "train_speed(iter/s)": 0.033921 }, { "epoch": 0.7752956636005256, "grad_norm": 0.326885461807251, "learning_rate": 8.69476341093784e-06, "loss": 0.4805999755859375, "memory(GiB)": 86.38, "step": 590, "token_acc": 0.8454463103616473, "train_speed(iter/s)": 0.033929 }, { "epoch": 0.7818659658344284, "grad_norm": 0.3168047070503235, "learning_rate": 8.67076963768782e-06, "loss": 0.48687124252319336, "memory(GiB)": 86.38, "step": 595, "token_acc": 0.8451851851851852, "train_speed(iter/s)": 0.033931 }, { "epoch": 0.7884362680683311, "grad_norm": 0.3170868456363678, "learning_rate": 8.646591090876225e-06, "loss": 0.48125357627868653, "memory(GiB)": 86.38, "step": 600, "token_acc": 0.8502272038776129, "train_speed(iter/s)": 0.033925 }, { "epoch": 0.7950065703022339, "grad_norm": 0.3512137532234192, "learning_rate": 8.622228987565597e-06, "loss": 0.48726634979248046, "memory(GiB)": 86.38, "step": 605, "token_acc": 0.8433869839048286, "train_speed(iter/s)": 0.033836 }, { "epoch": 0.8015768725361366, "grad_norm": 0.34979116916656494, "learning_rate": 8.597684554058053e-06, "loss": 0.4839656829833984, "memory(GiB)": 86.38, "step": 610, "token_acc": 0.8488303749853062, "train_speed(iter/s)": 0.033836 }, { "epoch": 0.8081471747700394, "grad_norm": 0.33397239446640015, "learning_rate": 8.572959025833573e-06, "loss": 0.4833966255187988, "memory(GiB)": 86.38, "step": 615, "token_acc": 0.8552229366501528, "train_speed(iter/s)": 0.03383 }, { "epoch": 0.8147174770039421, "grad_norm": 0.31006062030792236, "learning_rate": 8.548053647487808e-06, "loss": 0.4889863967895508, "memory(GiB)": 86.38, "step": 620, "token_acc": 0.8452540855160062, "train_speed(iter/s)": 0.033832 }, { "epoch": 0.8212877792378449, "grad_norm": 0.3102535307407379, "learning_rate": 8.522969672669419e-06, "loss": 0.48553314208984377, "memory(GiB)": 86.38, "step": 625, "token_acc": 0.8545072273324573, "train_speed(iter/s)": 0.033831 }, { "epoch": 0.8278580814717477, "grad_norm": 0.3058727979660034, "learning_rate": 8.49770836401699e-06, "loss": 0.47721147537231445, "memory(GiB)": 86.38, "step": 630, "token_acc": 0.8590224444841341, "train_speed(iter/s)": 0.033834 }, { "epoch": 0.8344283837056504, "grad_norm": 0.3120846152305603, "learning_rate": 8.47227099309546e-06, "loss": 0.48225932121276854, "memory(GiB)": 86.38, "step": 635, "token_acc": 0.854253918870408, "train_speed(iter/s)": 0.033836 }, { "epoch": 0.8409986859395532, "grad_norm": 0.3198888301849365, "learning_rate": 8.446658840332115e-06, "loss": 0.4882974624633789, "memory(GiB)": 86.38, "step": 640, "token_acc": 0.8472647079746746, "train_speed(iter/s)": 0.033841 }, { "epoch": 0.8475689881734559, "grad_norm": 0.3015914857387543, "learning_rate": 8.420873194952153e-06, "loss": 0.483825159072876, "memory(GiB)": 86.38, "step": 645, "token_acc": 0.8493750329623965, "train_speed(iter/s)": 0.033848 }, { "epoch": 0.8541392904073587, "grad_norm": 0.33040115237236023, "learning_rate": 8.394915354913763e-06, "loss": 0.48243865966796873, "memory(GiB)": 86.38, "step": 650, "token_acc": 0.8504132231404958, "train_speed(iter/s)": 0.033849 }, { "epoch": 0.8607095926412615, "grad_norm": 0.3229842782020569, "learning_rate": 8.368786626842815e-06, "loss": 0.4843127250671387, "memory(GiB)": 86.38, "step": 655, "token_acc": 0.8529356357927786, "train_speed(iter/s)": 0.033853 }, { "epoch": 0.8672798948751642, "grad_norm": 0.31925421953201294, "learning_rate": 8.342488325967068e-06, "loss": 0.48301048278808595, "memory(GiB)": 86.38, "step": 660, "token_acc": 0.8582582960770733, "train_speed(iter/s)": 0.033854 }, { "epoch": 0.873850197109067, "grad_norm": 0.30799737572669983, "learning_rate": 8.31602177604999e-06, "loss": 0.48166284561157224, "memory(GiB)": 86.38, "step": 665, "token_acc": 0.8686445412895295, "train_speed(iter/s)": 0.033865 }, { "epoch": 0.8804204993429697, "grad_norm": 0.31392061710357666, "learning_rate": 8.289388309324094e-06, "loss": 0.483530855178833, "memory(GiB)": 86.38, "step": 670, "token_acc": 0.8583989950896426, "train_speed(iter/s)": 0.033868 }, { "epoch": 0.8869908015768725, "grad_norm": 0.33349302411079407, "learning_rate": 8.262589266423908e-06, "loss": 0.48435115814208984, "memory(GiB)": 86.38, "step": 675, "token_acc": 0.8416313213703099, "train_speed(iter/s)": 0.033874 }, { "epoch": 0.8935611038107752, "grad_norm": 0.3091382086277008, "learning_rate": 8.235625996318475e-06, "loss": 0.4799081802368164, "memory(GiB)": 86.38, "step": 680, "token_acc": 0.8609777777777777, "train_speed(iter/s)": 0.033877 }, { "epoch": 0.900131406044678, "grad_norm": 0.3427553176879883, "learning_rate": 8.208499856243453e-06, "loss": 0.48143601417541504, "memory(GiB)": 86.38, "step": 685, "token_acc": 0.8536925941249482, "train_speed(iter/s)": 0.033876 }, { "epoch": 0.9067017082785808, "grad_norm": 0.3548396825790405, "learning_rate": 8.1812122116328e-06, "loss": 0.48082866668701174, "memory(GiB)": 86.38, "step": 690, "token_acc": 0.8531232091690545, "train_speed(iter/s)": 0.033877 }, { "epoch": 0.9132720105124835, "grad_norm": 0.3253563940525055, "learning_rate": 8.15376443605004e-06, "loss": 0.4795668601989746, "memory(GiB)": 86.38, "step": 695, "token_acc": 0.853655830467103, "train_speed(iter/s)": 0.033883 }, { "epoch": 0.9198423127463863, "grad_norm": 0.2970241606235504, "learning_rate": 8.126157911119124e-06, "loss": 0.479010009765625, "memory(GiB)": 86.38, "step": 700, "token_acc": 0.859375, "train_speed(iter/s)": 0.033891 }, { "epoch": 0.926412614980289, "grad_norm": 0.3558485805988312, "learning_rate": 8.098394026454886e-06, "loss": 0.4783782482147217, "memory(GiB)": 86.38, "step": 705, "token_acc": 0.8596869328493648, "train_speed(iter/s)": 0.033894 }, { "epoch": 0.9329829172141918, "grad_norm": 0.3010825514793396, "learning_rate": 8.070474179593088e-06, "loss": 0.47974371910095215, "memory(GiB)": 86.38, "step": 710, "token_acc": 0.8615735767991407, "train_speed(iter/s)": 0.033899 }, { "epoch": 0.9395532194480947, "grad_norm": 0.31274092197418213, "learning_rate": 8.042399775920084e-06, "loss": 0.48296613693237306, "memory(GiB)": 86.38, "step": 715, "token_acc": 0.8443671593590858, "train_speed(iter/s)": 0.033904 }, { "epoch": 0.9461235216819974, "grad_norm": 0.30195385217666626, "learning_rate": 8.014172228602063e-06, "loss": 0.48566722869873047, "memory(GiB)": 86.38, "step": 720, "token_acc": 0.8442668136714443, "train_speed(iter/s)": 0.033907 }, { "epoch": 0.9526938239159002, "grad_norm": 0.29728612303733826, "learning_rate": 7.985792958513932e-06, "loss": 0.4842525005340576, "memory(GiB)": 86.38, "step": 725, "token_acc": 0.8693410760843802, "train_speed(iter/s)": 0.03391 }, { "epoch": 0.9592641261498029, "grad_norm": 0.3458816707134247, "learning_rate": 7.957263394167778e-06, "loss": 0.47885870933532715, "memory(GiB)": 86.38, "step": 730, "token_acc": 0.8596500419111484, "train_speed(iter/s)": 0.033899 }, { "epoch": 0.9658344283837057, "grad_norm": 0.3230541944503784, "learning_rate": 7.928584971640974e-06, "loss": 0.4798708915710449, "memory(GiB)": 86.38, "step": 735, "token_acc": 0.8699983578739942, "train_speed(iter/s)": 0.033899 }, { "epoch": 0.9724047306176085, "grad_norm": 0.3110128939151764, "learning_rate": 7.899759134503888e-06, "loss": 0.4790318489074707, "memory(GiB)": 86.38, "step": 740, "token_acc": 0.8630462405391968, "train_speed(iter/s)": 0.033907 }, { "epoch": 0.9789750328515112, "grad_norm": 0.3367188274860382, "learning_rate": 7.870787333747216e-06, "loss": 0.47907276153564454, "memory(GiB)": 86.38, "step": 745, "token_acc": 0.8586263243898582, "train_speed(iter/s)": 0.03391 }, { "epoch": 0.985545335085414, "grad_norm": 0.3082112967967987, "learning_rate": 7.841671027708945e-06, "loss": 0.481706428527832, "memory(GiB)": 86.38, "step": 750, "token_acc": 0.8511583011583012, "train_speed(iter/s)": 0.033906 }, { "epoch": 0.9921156373193167, "grad_norm": 0.332453191280365, "learning_rate": 7.81241168200095e-06, "loss": 0.4739673137664795, "memory(GiB)": 86.38, "step": 755, "token_acc": 0.8551256316190212, "train_speed(iter/s)": 0.033908 }, { "epoch": 0.9986859395532195, "grad_norm": 0.28533536195755005, "learning_rate": 7.783010769435216e-06, "loss": 0.4861409664154053, "memory(GiB)": 86.38, "step": 760, "token_acc": 0.8556530110172211, "train_speed(iter/s)": 0.033911 }, { "epoch": 1.0052562417871223, "grad_norm": 0.33001649379730225, "learning_rate": 7.753469769949701e-06, "loss": 0.46169567108154297, "memory(GiB)": 86.38, "step": 765, "token_acc": 0.8602941176470589, "train_speed(iter/s)": 0.033926 }, { "epoch": 1.011826544021025, "grad_norm": 0.305500328540802, "learning_rate": 7.723790170533848e-06, "loss": 0.46022186279296873, "memory(GiB)": 86.38, "step": 770, "token_acc": 0.8599308445173768, "train_speed(iter/s)": 0.033928 }, { "epoch": 1.0183968462549278, "grad_norm": 0.2889300584793091, "learning_rate": 7.693973465153724e-06, "loss": 0.46282401084899905, "memory(GiB)": 86.38, "step": 775, "token_acc": 0.862350683914093, "train_speed(iter/s)": 0.03393 }, { "epoch": 1.0249671484888305, "grad_norm": 0.33990442752838135, "learning_rate": 7.664021154676828e-06, "loss": 0.4604497909545898, "memory(GiB)": 86.38, "step": 780, "token_acc": 0.8684050268504678, "train_speed(iter/s)": 0.033928 }, { "epoch": 1.0315374507227333, "grad_norm": 0.31965604424476624, "learning_rate": 7.633934746796545e-06, "loss": 0.46096210479736327, "memory(GiB)": 86.38, "step": 785, "token_acc": 0.8674027168912702, "train_speed(iter/s)": 0.033927 }, { "epoch": 1.038107752956636, "grad_norm": 0.32439425587654114, "learning_rate": 7.603715755956243e-06, "loss": 0.45728340148925783, "memory(GiB)": 86.38, "step": 790, "token_acc": 0.8674415479709755, "train_speed(iter/s)": 0.033927 }, { "epoch": 1.0446780551905388, "grad_norm": 0.3270528018474579, "learning_rate": 7.573365703273045e-06, "loss": 0.46488609313964846, "memory(GiB)": 86.38, "step": 795, "token_acc": 0.850026525198939, "train_speed(iter/s)": 0.033931 }, { "epoch": 1.0512483574244416, "grad_norm": 0.2934127748012543, "learning_rate": 7.542886116461272e-06, "loss": 0.45778141021728513, "memory(GiB)": 86.38, "step": 800, "token_acc": 0.8622505823964347, "train_speed(iter/s)": 0.033934 }, { "epoch": 1.0578186596583443, "grad_norm": 0.31371569633483887, "learning_rate": 7.512278529755529e-06, "loss": 0.45838513374328616, "memory(GiB)": 86.38, "step": 805, "token_acc": 0.8530397056400681, "train_speed(iter/s)": 0.033931 }, { "epoch": 1.064388961892247, "grad_norm": 0.2872871160507202, "learning_rate": 7.481544483833485e-06, "loss": 0.4574404239654541, "memory(GiB)": 86.38, "step": 810, "token_acc": 0.8523446658851114, "train_speed(iter/s)": 0.033933 }, { "epoch": 1.0709592641261498, "grad_norm": 0.2994791865348816, "learning_rate": 7.450685525738315e-06, "loss": 0.45713510513305666, "memory(GiB)": 86.38, "step": 815, "token_acc": 0.8546161825726141, "train_speed(iter/s)": 0.033938 }, { "epoch": 1.0775295663600526, "grad_norm": 0.29632824659347534, "learning_rate": 7.419703208800839e-06, "loss": 0.45964574813842773, "memory(GiB)": 86.38, "step": 820, "token_acc": 0.8663826261908989, "train_speed(iter/s)": 0.033941 }, { "epoch": 1.0840998685939554, "grad_norm": 0.30519089102745056, "learning_rate": 7.388599092561315e-06, "loss": 0.4573044776916504, "memory(GiB)": 86.38, "step": 825, "token_acc": 0.8629596640793994, "train_speed(iter/s)": 0.033938 }, { "epoch": 1.090670170827858, "grad_norm": 0.29544419050216675, "learning_rate": 7.357374742690956e-06, "loss": 0.45876827239990237, "memory(GiB)": 86.38, "step": 830, "token_acc": 0.8560570320280967, "train_speed(iter/s)": 0.033938 }, { "epoch": 1.0972404730617609, "grad_norm": 0.3168863356113434, "learning_rate": 7.326031730913107e-06, "loss": 0.4636601448059082, "memory(GiB)": 86.38, "step": 835, "token_acc": 0.8670317181527017, "train_speed(iter/s)": 0.033943 }, { "epoch": 1.1038107752956636, "grad_norm": 0.30908459424972534, "learning_rate": 7.2945716349241305e-06, "loss": 0.4574262619018555, "memory(GiB)": 86.38, "step": 840, "token_acc": 0.8620744343412984, "train_speed(iter/s)": 0.03394 }, { "epoch": 1.1103810775295664, "grad_norm": 0.3176266551017761, "learning_rate": 7.262996038314001e-06, "loss": 0.461370849609375, "memory(GiB)": 86.38, "step": 845, "token_acc": 0.8680333119795003, "train_speed(iter/s)": 0.033941 }, { "epoch": 1.1169513797634691, "grad_norm": 0.302416056394577, "learning_rate": 7.231306530486579e-06, "loss": 0.45732645988464354, "memory(GiB)": 86.38, "step": 850, "token_acc": 0.8647487633428794, "train_speed(iter/s)": 0.033935 }, { "epoch": 1.123521681997372, "grad_norm": 0.30254605412483215, "learning_rate": 7.199504706579617e-06, "loss": 0.46079111099243164, "memory(GiB)": 86.38, "step": 855, "token_acc": 0.8521696665271383, "train_speed(iter/s)": 0.033936 }, { "epoch": 1.1300919842312747, "grad_norm": 0.29616811871528625, "learning_rate": 7.167592167384461e-06, "loss": 0.45458307266235354, "memory(GiB)": 86.38, "step": 860, "token_acc": 0.8740381023533806, "train_speed(iter/s)": 0.033936 }, { "epoch": 1.1366622864651774, "grad_norm": 0.2893197238445282, "learning_rate": 7.135570519265473e-06, "loss": 0.4566815853118896, "memory(GiB)": 86.38, "step": 865, "token_acc": 0.8525364274150027, "train_speed(iter/s)": 0.033926 }, { "epoch": 1.1432325886990802, "grad_norm": 0.30079302191734314, "learning_rate": 7.1034413740791705e-06, "loss": 0.4587052345275879, "memory(GiB)": 86.38, "step": 870, "token_acc": 0.8628954358850519, "train_speed(iter/s)": 0.033926 }, { "epoch": 1.149802890932983, "grad_norm": 0.3086967170238495, "learning_rate": 7.071206349093097e-06, "loss": 0.45635190010070803, "memory(GiB)": 86.38, "step": 875, "token_acc": 0.859285550721319, "train_speed(iter/s)": 0.033929 }, { "epoch": 1.1563731931668857, "grad_norm": 0.3067159354686737, "learning_rate": 7.038867066904407e-06, "loss": 0.45715036392211916, "memory(GiB)": 86.38, "step": 880, "token_acc": 0.8618468146027202, "train_speed(iter/s)": 0.033924 }, { "epoch": 1.1629434954007885, "grad_norm": 0.28498393297195435, "learning_rate": 7.006425155358195e-06, "loss": 0.4554757118225098, "memory(GiB)": 86.38, "step": 885, "token_acc": 0.8687036756920284, "train_speed(iter/s)": 0.033921 }, { "epoch": 1.1695137976346912, "grad_norm": 0.2907336950302124, "learning_rate": 6.9738822474655555e-06, "loss": 0.45355930328369143, "memory(GiB)": 86.38, "step": 890, "token_acc": 0.8599979554283378, "train_speed(iter/s)": 0.033924 }, { "epoch": 1.176084099868594, "grad_norm": 0.29509079456329346, "learning_rate": 6.941239981321379e-06, "loss": 0.45787954330444336, "memory(GiB)": 86.38, "step": 895, "token_acc": 0.8733064370446197, "train_speed(iter/s)": 0.033929 }, { "epoch": 1.1826544021024967, "grad_norm": 0.2977595031261444, "learning_rate": 6.908500000021905e-06, "loss": 0.456469202041626, "memory(GiB)": 86.38, "step": 900, "token_acc": 0.8686904761904762, "train_speed(iter/s)": 0.03393 }, { "epoch": 1.1892247043363995, "grad_norm": 0.28251177072525024, "learning_rate": 6.875663951582e-06, "loss": 0.45859241485595703, "memory(GiB)": 86.38, "step": 905, "token_acc": 0.8703482454975884, "train_speed(iter/s)": 0.033871 }, { "epoch": 1.1957950065703022, "grad_norm": 0.30164870619773865, "learning_rate": 6.842733488852218e-06, "loss": 0.45961766242980956, "memory(GiB)": 86.38, "step": 910, "token_acc": 0.8695999018163967, "train_speed(iter/s)": 0.033871 }, { "epoch": 1.202365308804205, "grad_norm": 0.2958962023258209, "learning_rate": 6.80971026943559e-06, "loss": 0.45937299728393555, "memory(GiB)": 86.38, "step": 915, "token_acc": 0.8646010935387148, "train_speed(iter/s)": 0.033864 }, { "epoch": 1.2089356110381078, "grad_norm": 0.305772989988327, "learning_rate": 6.776595955604192e-06, "loss": 0.4570772171020508, "memory(GiB)": 86.38, "step": 920, "token_acc": 0.861623201438849, "train_speed(iter/s)": 0.033866 }, { "epoch": 1.2155059132720105, "grad_norm": 0.29926493763923645, "learning_rate": 6.743392214215473e-06, "loss": 0.45430717468261717, "memory(GiB)": 86.38, "step": 925, "token_acc": 0.8663976363767385, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.2220762155059133, "grad_norm": 0.3044881522655487, "learning_rate": 6.710100716628345e-06, "loss": 0.455517578125, "memory(GiB)": 86.38, "step": 930, "token_acc": 0.8511478910838227, "train_speed(iter/s)": 0.033863 }, { "epoch": 1.228646517739816, "grad_norm": 0.3772009313106537, "learning_rate": 6.676723138619056e-06, "loss": 0.46090059280395507, "memory(GiB)": 86.38, "step": 935, "token_acc": 0.8711496746203905, "train_speed(iter/s)": 0.033858 }, { "epoch": 1.2352168199737188, "grad_norm": 0.29388174414634705, "learning_rate": 6.6432611602968445e-06, "loss": 0.456877326965332, "memory(GiB)": 86.38, "step": 940, "token_acc": 0.8651419558359621, "train_speed(iter/s)": 0.03386 }, { "epoch": 1.2417871222076216, "grad_norm": 0.29652050137519836, "learning_rate": 6.609716466019356e-06, "loss": 0.45618433952331544, "memory(GiB)": 86.38, "step": 945, "token_acc": 0.8603668915085418, "train_speed(iter/s)": 0.033859 }, { "epoch": 1.2483574244415243, "grad_norm": 0.28154268860816956, "learning_rate": 6.576090744307866e-06, "loss": 0.45843868255615233, "memory(GiB)": 86.38, "step": 950, "token_acc": 0.8659341793046529, "train_speed(iter/s)": 0.033861 }, { "epoch": 1.254927726675427, "grad_norm": 0.284541517496109, "learning_rate": 6.542385687762287e-06, "loss": 0.4614737033843994, "memory(GiB)": 86.38, "step": 955, "token_acc": 0.8557089929269114, "train_speed(iter/s)": 0.033856 }, { "epoch": 1.2614980289093298, "grad_norm": 0.2883804142475128, "learning_rate": 6.508602992975963e-06, "loss": 0.4575353622436523, "memory(GiB)": 86.38, "step": 960, "token_acc": 0.862012703222423, "train_speed(iter/s)": 0.03385 }, { "epoch": 1.2680683311432326, "grad_norm": 0.2853713035583496, "learning_rate": 6.474744360450274e-06, "loss": 0.4590480804443359, "memory(GiB)": 86.38, "step": 965, "token_acc": 0.8613731343283582, "train_speed(iter/s)": 0.033851 }, { "epoch": 1.2746386333771353, "grad_norm": 0.2936136722564697, "learning_rate": 6.44081149450904e-06, "loss": 0.45726985931396485, "memory(GiB)": 86.38, "step": 970, "token_acc": 0.8545799374647599, "train_speed(iter/s)": 0.033849 }, { "epoch": 1.281208935611038, "grad_norm": 0.31412455439567566, "learning_rate": 6.406806103212725e-06, "loss": 0.45641331672668456, "memory(GiB)": 86.38, "step": 975, "token_acc": 0.8715530697190427, "train_speed(iter/s)": 0.033845 }, { "epoch": 1.2877792378449409, "grad_norm": 0.31974250078201294, "learning_rate": 6.372729898272463e-06, "loss": 0.46121625900268554, "memory(GiB)": 86.38, "step": 980, "token_acc": 0.8484265561803295, "train_speed(iter/s)": 0.033852 }, { "epoch": 1.2943495400788436, "grad_norm": 0.29389360547065735, "learning_rate": 6.338584594963898e-06, "loss": 0.4556922435760498, "memory(GiB)": 86.38, "step": 985, "token_acc": 0.8639753820476712, "train_speed(iter/s)": 0.033848 }, { "epoch": 1.3009198423127464, "grad_norm": 0.30771321058273315, "learning_rate": 6.30437191204084e-06, "loss": 0.46083745956420896, "memory(GiB)": 86.38, "step": 990, "token_acc": 0.8666952159549737, "train_speed(iter/s)": 0.033849 }, { "epoch": 1.3074901445466491, "grad_norm": 0.29386404156684875, "learning_rate": 6.270093571648752e-06, "loss": 0.45865530967712403, "memory(GiB)": 86.38, "step": 995, "token_acc": 0.8546142578125, "train_speed(iter/s)": 0.033849 }, { "epoch": 1.314060446780552, "grad_norm": 0.2929444909095764, "learning_rate": 6.23575129923806e-06, "loss": 0.45972671508789065, "memory(GiB)": 86.38, "step": 1000, "token_acc": 0.8530415342981528, "train_speed(iter/s)": 0.033851 }, { "epoch": 1.3206307490144547, "grad_norm": 0.2973506450653076, "learning_rate": 6.2013468234773034e-06, "loss": 0.45803632736206057, "memory(GiB)": 86.38, "step": 1005, "token_acc": 0.857928142355208, "train_speed(iter/s)": 0.033853 }, { "epoch": 1.3272010512483574, "grad_norm": 0.30529940128326416, "learning_rate": 6.166881876166119e-06, "loss": 0.4576756000518799, "memory(GiB)": 86.38, "step": 1010, "token_acc": 0.8755669493196608, "train_speed(iter/s)": 0.03385 }, { "epoch": 1.3337713534822602, "grad_norm": 0.293550968170166, "learning_rate": 6.132358192148065e-06, "loss": 0.4561765670776367, "memory(GiB)": 86.38, "step": 1015, "token_acc": 0.8672781599610868, "train_speed(iter/s)": 0.033849 }, { "epoch": 1.340341655716163, "grad_norm": 0.29839423298835754, "learning_rate": 6.097777509223299e-06, "loss": 0.455903148651123, "memory(GiB)": 86.38, "step": 1020, "token_acc": 0.8684119278779473, "train_speed(iter/s)": 0.033847 }, { "epoch": 1.3469119579500657, "grad_norm": 0.3058245778083801, "learning_rate": 6.063141568061104e-06, "loss": 0.4578727722167969, "memory(GiB)": 86.38, "step": 1025, "token_acc": 0.8626132709733996, "train_speed(iter/s)": 0.033852 }, { "epoch": 1.3534822601839684, "grad_norm": 0.2938694357872009, "learning_rate": 6.02845211211226e-06, "loss": 0.45619792938232423, "memory(GiB)": 86.38, "step": 1030, "token_acc": 0.864321608040201, "train_speed(iter/s)": 0.033855 }, { "epoch": 1.3600525624178712, "grad_norm": 0.33827096223831177, "learning_rate": 5.993710887521302e-06, "loss": 0.45999650955200194, "memory(GiB)": 86.38, "step": 1035, "token_acc": 0.8575886524822695, "train_speed(iter/s)": 0.033856 }, { "epoch": 1.366622864651774, "grad_norm": 0.2824879586696625, "learning_rate": 5.958919643038609e-06, "loss": 0.45719089508056643, "memory(GiB)": 86.38, "step": 1040, "token_acc": 0.8549390889830508, "train_speed(iter/s)": 0.033856 }, { "epoch": 1.3731931668856767, "grad_norm": 0.2904459238052368, "learning_rate": 5.924080129932386e-06, "loss": 0.4534614562988281, "memory(GiB)": 86.38, "step": 1045, "token_acc": 0.8642217245240762, "train_speed(iter/s)": 0.033848 }, { "epoch": 1.3797634691195795, "grad_norm": 0.31164076924324036, "learning_rate": 5.8891941019005095e-06, "loss": 0.4557456970214844, "memory(GiB)": 86.38, "step": 1050, "token_acc": 0.8531942479962282, "train_speed(iter/s)": 0.033847 }, { "epoch": 1.3863337713534822, "grad_norm": 0.2827838063240051, "learning_rate": 5.854263314982252e-06, "loss": 0.4562164306640625, "memory(GiB)": 86.38, "step": 1055, "token_acc": 0.8564340588988476, "train_speed(iter/s)": 0.033846 }, { "epoch": 1.392904073587385, "grad_norm": 0.29443469643592834, "learning_rate": 5.819289527469897e-06, "loss": 0.45438013076782224, "memory(GiB)": 86.38, "step": 1060, "token_acc": 0.8631507279773751, "train_speed(iter/s)": 0.033851 }, { "epoch": 1.3994743758212878, "grad_norm": 0.2858130633831024, "learning_rate": 5.784274499820214e-06, "loss": 0.45337843894958496, "memory(GiB)": 86.38, "step": 1065, "token_acc": 0.8435270132517839, "train_speed(iter/s)": 0.033852 }, { "epoch": 1.4060446780551905, "grad_norm": 0.2949610650539398, "learning_rate": 5.749219994565863e-06, "loss": 0.4539140224456787, "memory(GiB)": 86.38, "step": 1070, "token_acc": 0.8618331826401446, "train_speed(iter/s)": 0.033854 }, { "epoch": 1.4126149802890933, "grad_norm": 0.2909865081310272, "learning_rate": 5.714127776226667e-06, "loss": 0.4557938575744629, "memory(GiB)": 86.38, "step": 1075, "token_acc": 0.8680278588011191, "train_speed(iter/s)": 0.033856 }, { "epoch": 1.419185282522996, "grad_norm": 0.28090617060661316, "learning_rate": 5.6789996112207865e-06, "loss": 0.4519779205322266, "memory(GiB)": 86.38, "step": 1080, "token_acc": 0.8621539840860697, "train_speed(iter/s)": 0.033857 }, { "epoch": 1.4257555847568988, "grad_norm": 0.26703914999961853, "learning_rate": 5.64383726777582e-06, "loss": 0.4575533866882324, "memory(GiB)": 86.38, "step": 1085, "token_acc": 0.8600892222150385, "train_speed(iter/s)": 0.03386 }, { "epoch": 1.4323258869908015, "grad_norm": 0.29428642988204956, "learning_rate": 5.608642515839777e-06, "loss": 0.4562852382659912, "memory(GiB)": 86.38, "step": 1090, "token_acc": 0.8570395907473309, "train_speed(iter/s)": 0.033858 }, { "epoch": 1.4388961892247043, "grad_norm": 0.2922196090221405, "learning_rate": 5.573417126992004e-06, "loss": 0.455198860168457, "memory(GiB)": 86.38, "step": 1095, "token_acc": 0.8534050553582619, "train_speed(iter/s)": 0.033859 }, { "epoch": 1.445466491458607, "grad_norm": 0.2833230793476105, "learning_rate": 5.538162874353994e-06, "loss": 0.45499043464660643, "memory(GiB)": 86.38, "step": 1100, "token_acc": 0.8599968372779505, "train_speed(iter/s)": 0.033861 }, { "epoch": 1.4520367936925098, "grad_norm": 0.30704233050346375, "learning_rate": 5.502881532500149e-06, "loss": 0.4561596870422363, "memory(GiB)": 86.38, "step": 1105, "token_acc": 0.8647945610404966, "train_speed(iter/s)": 0.033863 }, { "epoch": 1.4586070959264126, "grad_norm": 0.2708365321159363, "learning_rate": 5.467574877368441e-06, "loss": 0.45220632553100587, "memory(GiB)": 86.38, "step": 1110, "token_acc": 0.86642938687798, "train_speed(iter/s)": 0.033866 }, { "epoch": 1.4651773981603153, "grad_norm": 0.28449153900146484, "learning_rate": 5.432244686171025e-06, "loss": 0.45653414726257324, "memory(GiB)": 86.38, "step": 1115, "token_acc": 0.8675830627892519, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.471747700394218, "grad_norm": 0.28766512870788574, "learning_rate": 5.396892737304779e-06, "loss": 0.4552262783050537, "memory(GiB)": 86.38, "step": 1120, "token_acc": 0.8638403990024938, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.4783180026281209, "grad_norm": 0.28682559728622437, "learning_rate": 5.361520810261779e-06, "loss": 0.45450830459594727, "memory(GiB)": 86.38, "step": 1125, "token_acc": 0.860114404576183, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.4848883048620236, "grad_norm": 0.30013778805732727, "learning_rate": 5.3261306855397395e-06, "loss": 0.45503602027893064, "memory(GiB)": 86.38, "step": 1130, "token_acc": 0.8707037643207856, "train_speed(iter/s)": 0.033864 }, { "epoch": 1.4914586070959264, "grad_norm": 0.28545552492141724, "learning_rate": 5.290724144552379e-06, "loss": 0.45638151168823243, "memory(GiB)": 86.38, "step": 1135, "token_acc": 0.8654683330992838, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.4980289093298291, "grad_norm": 0.2808593213558197, "learning_rate": 5.255302969539753e-06, "loss": 0.454376745223999, "memory(GiB)": 86.38, "step": 1140, "token_acc": 0.8695363037301251, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.5045992115637319, "grad_norm": 0.30250662565231323, "learning_rate": 5.219868943478542e-06, "loss": 0.45623059272766114, "memory(GiB)": 86.38, "step": 1145, "token_acc": 0.8605342850962578, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.5111695137976346, "grad_norm": 0.296613484621048, "learning_rate": 5.184423849992299e-06, "loss": 0.4548806190490723, "memory(GiB)": 86.38, "step": 1150, "token_acc": 0.8635175178664808, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.5177398160315374, "grad_norm": 0.28246545791625977, "learning_rate": 5.1489694732616805e-06, "loss": 0.4554699420928955, "memory(GiB)": 86.38, "step": 1155, "token_acc": 0.862121567707111, "train_speed(iter/s)": 0.033866 }, { "epoch": 1.5243101182654402, "grad_norm": 0.26761719584465027, "learning_rate": 5.11350759793462e-06, "loss": 0.45384392738342283, "memory(GiB)": 86.38, "step": 1160, "token_acc": 0.8527565417365902, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.530880420499343, "grad_norm": 0.2766062021255493, "learning_rate": 5.078040009036509e-06, "loss": 0.45311508178710935, "memory(GiB)": 86.38, "step": 1165, "token_acc": 0.860136895026955, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.5374507227332457, "grad_norm": 0.2843003571033478, "learning_rate": 5.042568491880338e-06, "loss": 0.455690860748291, "memory(GiB)": 86.38, "step": 1170, "token_acc": 0.8672405980969642, "train_speed(iter/s)": 0.033867 }, { "epoch": 1.5440210249671484, "grad_norm": 0.2944943308830261, "learning_rate": 5.007094831976832e-06, "loss": 0.45423293113708496, "memory(GiB)": 86.38, "step": 1175, "token_acc": 0.865735444638449, "train_speed(iter/s)": 0.033867 }, { "epoch": 1.5505913272010512, "grad_norm": 0.2819548547267914, "learning_rate": 4.9716208149445776e-06, "loss": 0.45132970809936523, "memory(GiB)": 86.38, "step": 1180, "token_acc": 0.8634401381427476, "train_speed(iter/s)": 0.033869 }, { "epoch": 1.557161629434954, "grad_norm": 0.27042356133461, "learning_rate": 4.936148226420133e-06, "loss": 0.45566673278808595, "memory(GiB)": 86.38, "step": 1185, "token_acc": 0.8692132269099202, "train_speed(iter/s)": 0.033865 }, { "epoch": 1.563731931668857, "grad_norm": 0.29058489203453064, "learning_rate": 4.900678851968152e-06, "loss": 0.4520698070526123, "memory(GiB)": 86.38, "step": 1190, "token_acc": 0.8643418665591615, "train_speed(iter/s)": 0.033866 }, { "epoch": 1.5703022339027597, "grad_norm": 0.274539053440094, "learning_rate": 4.865214476991506e-06, "loss": 0.4568329811096191, "memory(GiB)": 86.38, "step": 1195, "token_acc": 0.8561119477911646, "train_speed(iter/s)": 0.033867 }, { "epoch": 1.5768725361366625, "grad_norm": 0.2732899785041809, "learning_rate": 4.829756886641408e-06, "loss": 0.45705676078796387, "memory(GiB)": 86.38, "step": 1200, "token_acc": 0.8784363482569029, "train_speed(iter/s)": 0.033866 }, { "epoch": 1.5834428383705652, "grad_norm": 0.27467477321624756, "learning_rate": 4.794307865727555e-06, "loss": 0.45558509826660154, "memory(GiB)": 86.38, "step": 1205, "token_acc": 0.8533988533988534, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.590013140604468, "grad_norm": 0.2909936308860779, "learning_rate": 4.758869198628296e-06, "loss": 0.45391244888305665, "memory(GiB)": 86.38, "step": 1210, "token_acc": 0.8756224804363292, "train_speed(iter/s)": 0.033824 }, { "epoch": 1.5965834428383707, "grad_norm": 0.2969980835914612, "learning_rate": 4.7234426692007985e-06, "loss": 0.454874324798584, "memory(GiB)": 86.38, "step": 1215, "token_acc": 0.8589074167649206, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.6031537450722735, "grad_norm": 0.2968142032623291, "learning_rate": 4.688030060691264e-06, "loss": 0.4513202667236328, "memory(GiB)": 86.38, "step": 1220, "token_acc": 0.8506660149089575, "train_speed(iter/s)": 0.033824 }, { "epoch": 1.6097240473061762, "grad_norm": 0.28620168566703796, "learning_rate": 4.6526331556451674e-06, "loss": 0.44993081092834475, "memory(GiB)": 86.38, "step": 1225, "token_acc": 0.8493528096896605, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.616294349540079, "grad_norm": 0.2923036515712738, "learning_rate": 4.617253735817522e-06, "loss": 0.4529541492462158, "memory(GiB)": 86.38, "step": 1230, "token_acc": 0.8594011423296601, "train_speed(iter/s)": 0.033823 }, { "epoch": 1.6228646517739818, "grad_norm": 0.29773661494255066, "learning_rate": 4.5818935820832014e-06, "loss": 0.4512050151824951, "memory(GiB)": 86.38, "step": 1235, "token_acc": 0.8610426631879017, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.6294349540078845, "grad_norm": 0.2810444235801697, "learning_rate": 4.546554474347291e-06, "loss": 0.4555663108825684, "memory(GiB)": 86.38, "step": 1240, "token_acc": 0.8596368270149729, "train_speed(iter/s)": 0.033828 }, { "epoch": 1.6360052562417873, "grad_norm": 0.2784985601902008, "learning_rate": 4.511238191455491e-06, "loss": 0.45386524200439454, "memory(GiB)": 86.38, "step": 1245, "token_acc": 0.866062264796442, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.64257555847569, "grad_norm": 0.27828744053840637, "learning_rate": 4.475946511104588e-06, "loss": 0.45246143341064454, "memory(GiB)": 86.38, "step": 1250, "token_acc": 0.8584367661858436, "train_speed(iter/s)": 0.033828 }, { "epoch": 1.6491458607095928, "grad_norm": 0.2854389250278473, "learning_rate": 4.440681209752955e-06, "loss": 0.4526336669921875, "memory(GiB)": 86.38, "step": 1255, "token_acc": 0.851116058685848, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.6557161629434956, "grad_norm": 0.29449641704559326, "learning_rate": 4.405444062531145e-06, "loss": 0.4575493812561035, "memory(GiB)": 86.38, "step": 1260, "token_acc": 0.8626177520332339, "train_speed(iter/s)": 0.033821 }, { "epoch": 1.6622864651773983, "grad_norm": 0.28538015484809875, "learning_rate": 4.37023684315253e-06, "loss": 0.45549468994140624, "memory(GiB)": 86.38, "step": 1265, "token_acc": 0.8691069738087724, "train_speed(iter/s)": 0.03382 }, { "epoch": 1.668856767411301, "grad_norm": 0.27826598286628723, "learning_rate": 4.335061323824019e-06, "loss": 0.44781084060668946, "memory(GiB)": 86.38, "step": 1270, "token_acc": 0.8674536256323777, "train_speed(iter/s)": 0.033821 }, { "epoch": 1.6754270696452038, "grad_norm": 0.2691604495048523, "learning_rate": 4.299919275156857e-06, "loss": 0.4545548439025879, "memory(GiB)": 86.38, "step": 1275, "token_acc": 0.8644137364892598, "train_speed(iter/s)": 0.033824 }, { "epoch": 1.6819973718791066, "grad_norm": 0.27578890323638916, "learning_rate": 4.264812466077486e-06, "loss": 0.4538686752319336, "memory(GiB)": 86.38, "step": 1280, "token_acc": 0.8544989775051125, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.6885676741130093, "grad_norm": 0.2718227803707123, "learning_rate": 4.229742663738521e-06, "loss": 0.4527297496795654, "memory(GiB)": 86.38, "step": 1285, "token_acc": 0.8661887694145759, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.695137976346912, "grad_norm": 0.2723022997379303, "learning_rate": 4.194711633429782e-06, "loss": 0.4542956829071045, "memory(GiB)": 86.38, "step": 1290, "token_acc": 0.8571600048013444, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.7017082785808149, "grad_norm": 0.2890985310077667, "learning_rate": 4.159721138489445e-06, "loss": 0.449599027633667, "memory(GiB)": 86.38, "step": 1295, "token_acc": 0.8626619837713455, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.7082785808147176, "grad_norm": 0.279776394367218, "learning_rate": 4.124772940215279e-06, "loss": 0.4549734115600586, "memory(GiB)": 86.38, "step": 1300, "token_acc": 0.8570111173728162, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.7148488830486204, "grad_norm": 0.2932436168193817, "learning_rate": 4.0898687977759895e-06, "loss": 0.45325145721435545, "memory(GiB)": 86.38, "step": 1305, "token_acc": 0.8666294359547139, "train_speed(iter/s)": 0.033831 }, { "epoch": 1.7214191852825231, "grad_norm": 0.2910197675228119, "learning_rate": 4.0550104681226635e-06, "loss": 0.45451927185058594, "memory(GiB)": 86.38, "step": 1310, "token_acc": 0.8454388043379204, "train_speed(iter/s)": 0.033828 }, { "epoch": 1.727989487516426, "grad_norm": 0.2771059274673462, "learning_rate": 4.020199705900335e-06, "loss": 0.45571699142456057, "memory(GiB)": 86.38, "step": 1315, "token_acc": 0.8857914854356136, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.7345597897503287, "grad_norm": 0.27845674753189087, "learning_rate": 3.985438263359667e-06, "loss": 0.4508528709411621, "memory(GiB)": 86.38, "step": 1320, "token_acc": 0.8715719063545151, "train_speed(iter/s)": 0.033829 }, { "epoch": 1.7411300919842314, "grad_norm": 0.2838834524154663, "learning_rate": 3.950727890268736e-06, "loss": 0.45130367279052735, "memory(GiB)": 86.38, "step": 1325, "token_acc": 0.8547756346523497, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.7477003942181342, "grad_norm": 0.27185139060020447, "learning_rate": 3.91607033382497e-06, "loss": 0.4526374340057373, "memory(GiB)": 86.38, "step": 1330, "token_acc": 0.8633136094674556, "train_speed(iter/s)": 0.033824 }, { "epoch": 1.754270696452037, "grad_norm": 0.28836262226104736, "learning_rate": 3.88146733856719e-06, "loss": 0.4543032646179199, "memory(GiB)": 86.38, "step": 1335, "token_acc": 0.8587059705221084, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.7608409986859397, "grad_norm": 0.27373170852661133, "learning_rate": 3.8469206462878e-06, "loss": 0.4514758586883545, "memory(GiB)": 86.38, "step": 1340, "token_acc": 0.861223101957546, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.7674113009198424, "grad_norm": 0.26478344202041626, "learning_rate": 3.8124319959451133e-06, "loss": 0.45225229263305666, "memory(GiB)": 86.38, "step": 1345, "token_acc": 0.8613606419930531, "train_speed(iter/s)": 0.033822 }, { "epoch": 1.7739816031537452, "grad_norm": 0.31700122356414795, "learning_rate": 3.778003123575815e-06, "loss": 0.45349550247192383, "memory(GiB)": 86.38, "step": 1350, "token_acc": 0.8643513203214696, "train_speed(iter/s)": 0.033818 }, { "epoch": 1.780551905387648, "grad_norm": 0.26822659373283386, "learning_rate": 3.743635762207582e-06, "loss": 0.44829654693603516, "memory(GiB)": 86.38, "step": 1355, "token_acc": 0.8705515383524741, "train_speed(iter/s)": 0.03382 }, { "epoch": 1.7871222076215507, "grad_norm": 0.2593797445297241, "learning_rate": 3.7093316417718407e-06, "loss": 0.45132102966308596, "memory(GiB)": 86.38, "step": 1360, "token_acc": 0.8722857336129338, "train_speed(iter/s)": 0.033822 }, { "epoch": 1.7936925098554535, "grad_norm": 0.2924158275127411, "learning_rate": 3.675092489016693e-06, "loss": 0.4512333869934082, "memory(GiB)": 86.38, "step": 1365, "token_acc": 0.86383098856632, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.8002628120893562, "grad_norm": 0.2746325135231018, "learning_rate": 3.640920027420001e-06, "loss": 0.4558290481567383, "memory(GiB)": 86.38, "step": 1370, "token_acc": 0.8634496357561483, "train_speed(iter/s)": 0.033824 }, { "epoch": 1.806833114323259, "grad_norm": 0.27387329936027527, "learning_rate": 3.6068159771026267e-06, "loss": 0.4523761749267578, "memory(GiB)": 86.38, "step": 1375, "token_acc": 0.8614295741693964, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.8134034165571618, "grad_norm": 0.2677063047885895, "learning_rate": 3.5727820547418525e-06, "loss": 0.4497382640838623, "memory(GiB)": 86.38, "step": 1380, "token_acc": 0.8671490051768228, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.8199737187910645, "grad_norm": 0.26505404710769653, "learning_rate": 3.5388199734849626e-06, "loss": 0.45242948532104493, "memory(GiB)": 86.38, "step": 1385, "token_acc": 0.8609067954770008, "train_speed(iter/s)": 0.033825 }, { "epoch": 1.8265440210249673, "grad_norm": 0.28987395763397217, "learning_rate": 3.504931442863023e-06, "loss": 0.45121097564697266, "memory(GiB)": 86.38, "step": 1390, "token_acc": 0.8593791633359978, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.83311432325887, "grad_norm": 0.2953889071941376, "learning_rate": 3.4711181687048114e-06, "loss": 0.4545147895812988, "memory(GiB)": 86.38, "step": 1395, "token_acc": 0.8520735098537057, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.8396846254927728, "grad_norm": 0.27598556876182556, "learning_rate": 3.4373818530509686e-06, "loss": 0.45116052627563474, "memory(GiB)": 86.38, "step": 1400, "token_acc": 0.866229439933375, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.8462549277266755, "grad_norm": 0.27450037002563477, "learning_rate": 3.40372419406831e-06, "loss": 0.4568813323974609, "memory(GiB)": 86.38, "step": 1405, "token_acc": 0.86804211035818, "train_speed(iter/s)": 0.033826 }, { "epoch": 1.8528252299605783, "grad_norm": 0.2719385027885437, "learning_rate": 3.3701468859643583e-06, "loss": 0.4519033432006836, "memory(GiB)": 86.38, "step": 1410, "token_acc": 0.8648355441589822, "train_speed(iter/s)": 0.033827 }, { "epoch": 1.859395532194481, "grad_norm": 0.2851196825504303, "learning_rate": 3.336651618902054e-06, "loss": 0.4524543762207031, "memory(GiB)": 86.38, "step": 1415, "token_acc": 0.8464139526606158, "train_speed(iter/s)": 0.033828 }, { "epoch": 1.8659658344283838, "grad_norm": 0.2691018879413605, "learning_rate": 3.303240078914679e-06, "loss": 0.45388317108154297, "memory(GiB)": 86.38, "step": 1420, "token_acc": 0.8622832288312715, "train_speed(iter/s)": 0.033828 }, { "epoch": 1.8725361366622866, "grad_norm": 0.2715182900428772, "learning_rate": 3.2699139478209987e-06, "loss": 0.4549809455871582, "memory(GiB)": 86.38, "step": 1425, "token_acc": 0.8636651870640456, "train_speed(iter/s)": 0.03383 }, { "epoch": 1.8791064388961893, "grad_norm": 0.2916743755340576, "learning_rate": 3.2366749031405875e-06, "loss": 0.4505608558654785, "memory(GiB)": 86.38, "step": 1430, "token_acc": 0.8645326192794547, "train_speed(iter/s)": 0.033829 }, { "epoch": 1.885676741130092, "grad_norm": 0.2814328968524933, "learning_rate": 3.203524618009403e-06, "loss": 0.4522216796875, "memory(GiB)": 86.38, "step": 1435, "token_acc": 0.8565567219054724, "train_speed(iter/s)": 0.033831 }, { "epoch": 1.8922470433639949, "grad_norm": 0.29333144426345825, "learning_rate": 3.1704647610955618e-06, "loss": 0.4518414497375488, "memory(GiB)": 86.38, "step": 1440, "token_acc": 0.8547056199821588, "train_speed(iter/s)": 0.033834 }, { "epoch": 1.8988173455978976, "grad_norm": 0.26604127883911133, "learning_rate": 3.137496996515339e-06, "loss": 0.4495247840881348, "memory(GiB)": 86.38, "step": 1445, "token_acc": 0.8561262009251571, "train_speed(iter/s)": 0.033832 }, { "epoch": 1.9053876478318004, "grad_norm": 0.26928678154945374, "learning_rate": 3.1046229837494123e-06, "loss": 0.44922027587890623, "memory(GiB)": 86.38, "step": 1450, "token_acc": 0.8630366102954841, "train_speed(iter/s)": 0.033835 }, { "epoch": 1.9119579500657031, "grad_norm": 0.2921224534511566, "learning_rate": 3.0718443775593233e-06, "loss": 0.44977540969848634, "memory(GiB)": 86.38, "step": 1455, "token_acc": 0.8656272709255467, "train_speed(iter/s)": 0.033835 }, { "epoch": 1.9185282522996059, "grad_norm": 0.2801390290260315, "learning_rate": 3.0391628279041797e-06, "loss": 0.45065975189208984, "memory(GiB)": 86.38, "step": 1460, "token_acc": 0.8713450292397661, "train_speed(iter/s)": 0.033836 }, { "epoch": 1.9250985545335086, "grad_norm": 0.28972676396369934, "learning_rate": 3.0065799798576146e-06, "loss": 0.4490159034729004, "memory(GiB)": 86.38, "step": 1465, "token_acc": 0.8605760938308515, "train_speed(iter/s)": 0.033833 }, { "epoch": 1.9316688567674114, "grad_norm": 0.2788577675819397, "learning_rate": 2.9740974735249627e-06, "loss": 0.45141172409057617, "memory(GiB)": 86.38, "step": 1470, "token_acc": 0.8731429833765947, "train_speed(iter/s)": 0.033833 }, { "epoch": 1.9382391590013142, "grad_norm": 0.27176031470298767, "learning_rate": 2.941716943960716e-06, "loss": 0.4523900508880615, "memory(GiB)": 86.38, "step": 1475, "token_acc": 0.8687188222411486, "train_speed(iter/s)": 0.033835 }, { "epoch": 1.944809461235217, "grad_norm": 0.2714715003967285, "learning_rate": 2.9094400210862206e-06, "loss": 0.4515875816345215, "memory(GiB)": 86.38, "step": 1480, "token_acc": 0.8687481415402915, "train_speed(iter/s)": 0.033837 }, { "epoch": 1.9513797634691197, "grad_norm": 0.272011399269104, "learning_rate": 2.8772683296076197e-06, "loss": 0.44769134521484377, "memory(GiB)": 86.38, "step": 1485, "token_acc": 0.8557253110726099, "train_speed(iter/s)": 0.033839 }, { "epoch": 1.9579500657030224, "grad_norm": 0.2830789089202881, "learning_rate": 2.8452034889340874e-06, "loss": 0.4503666877746582, "memory(GiB)": 86.38, "step": 1490, "token_acc": 0.8650519031141869, "train_speed(iter/s)": 0.033839 }, { "epoch": 1.9645203679369252, "grad_norm": 0.27117088437080383, "learning_rate": 2.8132471130962997e-06, "loss": 0.44952926635742185, "memory(GiB)": 86.38, "step": 1495, "token_acc": 0.8653084323712507, "train_speed(iter/s)": 0.033838 }, { "epoch": 1.971090670170828, "grad_norm": 0.2866286337375641, "learning_rate": 2.781400810665201e-06, "loss": 0.45142645835876466, "memory(GiB)": 86.38, "step": 1500, "token_acc": 0.8606049336804265, "train_speed(iter/s)": 0.033839 }, { "epoch": 1.9776609724047307, "grad_norm": 0.25524598360061646, "learning_rate": 2.749666184671032e-06, "loss": 0.45200319290161134, "memory(GiB)": 86.38, "step": 1505, "token_acc": 0.8672011511974509, "train_speed(iter/s)": 0.033802 }, { "epoch": 1.9842312746386335, "grad_norm": 0.269008070230484, "learning_rate": 2.7180448325226283e-06, "loss": 0.449237060546875, "memory(GiB)": 86.38, "step": 1510, "token_acc": 0.8631796690307328, "train_speed(iter/s)": 0.0338 }, { "epoch": 1.9908015768725362, "grad_norm": 0.2759488821029663, "learning_rate": 2.686538345927027e-06, "loss": 0.454377269744873, "memory(GiB)": 86.38, "step": 1515, "token_acc": 0.8493589743589743, "train_speed(iter/s)": 0.033795 }, { "epoch": 1.997371879106439, "grad_norm": 0.2774396538734436, "learning_rate": 2.6551483108093378e-06, "loss": 0.45154151916503904, "memory(GiB)": 86.38, "step": 1520, "token_acc": 0.854857977170162, "train_speed(iter/s)": 0.033795 }, { "epoch": 2.0039421813403417, "grad_norm": 0.2865091860294342, "learning_rate": 2.623876307232919e-06, "loss": 0.43844971656799314, "memory(GiB)": 86.38, "step": 1525, "token_acc": 0.8616791354945968, "train_speed(iter/s)": 0.0338 }, { "epoch": 2.0105124835742445, "grad_norm": 0.28435423970222473, "learning_rate": 2.5927239093198273e-06, "loss": 0.4346470832824707, "memory(GiB)": 86.38, "step": 1530, "token_acc": 0.8676384460206937, "train_speed(iter/s)": 0.033799 }, { "epoch": 2.0170827858081473, "grad_norm": 0.2833334505558014, "learning_rate": 2.5616926851716055e-06, "loss": 0.43649768829345703, "memory(GiB)": 86.38, "step": 1535, "token_acc": 0.8553893161942894, "train_speed(iter/s)": 0.033798 }, { "epoch": 2.02365308804205, "grad_norm": 0.2661850154399872, "learning_rate": 2.5307841967903337e-06, "loss": 0.4341902732849121, "memory(GiB)": 86.38, "step": 1540, "token_acc": 0.853354760948172, "train_speed(iter/s)": 0.033796 }, { "epoch": 2.0302233902759528, "grad_norm": 0.2832602262496948, "learning_rate": 2.5000000000000015e-06, "loss": 0.4348430633544922, "memory(GiB)": 86.38, "step": 1545, "token_acc": 0.8529144141733126, "train_speed(iter/s)": 0.033797 }, { "epoch": 2.0367936925098555, "grad_norm": 0.26590895652770996, "learning_rate": 2.4693416443682074e-06, "loss": 0.431856632232666, "memory(GiB)": 86.38, "step": 1550, "token_acc": 0.8682563338301044, "train_speed(iter/s)": 0.033797 }, { "epoch": 2.0433639947437583, "grad_norm": 0.28006982803344727, "learning_rate": 2.4388106731281496e-06, "loss": 0.43282361030578614, "memory(GiB)": 86.38, "step": 1555, "token_acc": 0.869759845139435, "train_speed(iter/s)": 0.0338 }, { "epoch": 2.049934296977661, "grad_norm": 0.2961016893386841, "learning_rate": 2.40840862310094e-06, "loss": 0.43299617767333987, "memory(GiB)": 86.38, "step": 1560, "token_acc": 0.8845442367799962, "train_speed(iter/s)": 0.033797 }, { "epoch": 2.056504599211564, "grad_norm": 0.2669562101364136, "learning_rate": 2.378137024618262e-06, "loss": 0.4347973823547363, "memory(GiB)": 86.38, "step": 1565, "token_acc": 0.8502078945947406, "train_speed(iter/s)": 0.033798 }, { "epoch": 2.0630749014454666, "grad_norm": 0.2754296362400055, "learning_rate": 2.3479974014453255e-06, "loss": 0.43701701164245604, "memory(GiB)": 86.38, "step": 1570, "token_acc": 0.8600905562742561, "train_speed(iter/s)": 0.033799 }, { "epoch": 2.0696452036793693, "grad_norm": 0.2642713189125061, "learning_rate": 2.317991270704167e-06, "loss": 0.43048667907714844, "memory(GiB)": 86.38, "step": 1575, "token_acc": 0.8709290926914279, "train_speed(iter/s)": 0.033802 }, { "epoch": 2.076215505913272, "grad_norm": 0.2664032280445099, "learning_rate": 2.2881201427972894e-06, "loss": 0.43495759963989256, "memory(GiB)": 86.38, "step": 1580, "token_acc": 0.8594156340829127, "train_speed(iter/s)": 0.033803 }, { "epoch": 2.082785808147175, "grad_norm": 0.27893051505088806, "learning_rate": 2.2583855213316326e-06, "loss": 0.4322032928466797, "memory(GiB)": 86.38, "step": 1585, "token_acc": 0.8674502122102514, "train_speed(iter/s)": 0.0338 }, { "epoch": 2.0893561103810776, "grad_norm": 0.25695356726646423, "learning_rate": 2.228788903042877e-06, "loss": 0.4315330505371094, "memory(GiB)": 86.38, "step": 1590, "token_acc": 0.8767547253233116, "train_speed(iter/s)": 0.033798 }, { "epoch": 2.0959264126149804, "grad_norm": 0.2659642696380615, "learning_rate": 2.1993317777201197e-06, "loss": 0.43229498863220217, "memory(GiB)": 86.38, "step": 1595, "token_acc": 0.8707460370247201, "train_speed(iter/s)": 0.033796 }, { "epoch": 2.102496714848883, "grad_norm": 0.2697013020515442, "learning_rate": 2.170015628130871e-06, "loss": 0.4357916355133057, "memory(GiB)": 86.38, "step": 1600, "token_acc": 0.8637946662850055, "train_speed(iter/s)": 0.033796 }, { "epoch": 2.109067017082786, "grad_norm": 0.27165451645851135, "learning_rate": 2.1408419299464245e-06, "loss": 0.4324627876281738, "memory(GiB)": 86.38, "step": 1605, "token_acc": 0.8698603817087229, "train_speed(iter/s)": 0.033795 }, { "epoch": 2.1156373193166886, "grad_norm": 0.2767409384250641, "learning_rate": 2.111812151667567e-06, "loss": 0.433492374420166, "memory(GiB)": 86.38, "step": 1610, "token_acc": 0.8622224420157262, "train_speed(iter/s)": 0.033795 }, { "epoch": 2.1222076215505914, "grad_norm": 0.2886437177658081, "learning_rate": 2.0829277545506736e-06, "loss": 0.4330601692199707, "memory(GiB)": 86.38, "step": 1615, "token_acc": 0.87356944538498, "train_speed(iter/s)": 0.033795 }, { "epoch": 2.128777923784494, "grad_norm": 0.27543848752975464, "learning_rate": 2.0541901925341446e-06, "loss": 0.4322654724121094, "memory(GiB)": 86.38, "step": 1620, "token_acc": 0.8578295433536698, "train_speed(iter/s)": 0.033796 }, { "epoch": 2.135348226018397, "grad_norm": 0.2620643675327301, "learning_rate": 2.0256009121652147e-06, "loss": 0.43578500747680665, "memory(GiB)": 86.38, "step": 1625, "token_acc": 0.868349382355802, "train_speed(iter/s)": 0.033789 }, { "epoch": 2.1419185282522997, "grad_norm": 0.28385990858078003, "learning_rate": 1.9971613525271523e-06, "loss": 0.43427586555480957, "memory(GiB)": 86.38, "step": 1630, "token_acc": 0.8664960419022677, "train_speed(iter/s)": 0.033789 }, { "epoch": 2.1484888304862024, "grad_norm": 0.2743207514286041, "learning_rate": 1.9688729451668116e-06, "loss": 0.43171100616455077, "memory(GiB)": 86.38, "step": 1635, "token_acc": 0.8658852104123765, "train_speed(iter/s)": 0.033789 }, { "epoch": 2.155059132720105, "grad_norm": 0.27282217144966125, "learning_rate": 1.940737114022572e-06, "loss": 0.43387999534606936, "memory(GiB)": 86.38, "step": 1640, "token_acc": 0.8552638446683021, "train_speed(iter/s)": 0.033785 }, { "epoch": 2.161629434954008, "grad_norm": 0.26848945021629333, "learning_rate": 1.9127552753526683e-06, "loss": 0.4308422565460205, "memory(GiB)": 86.38, "step": 1645, "token_acc": 0.8723747980613893, "train_speed(iter/s)": 0.033783 }, { "epoch": 2.1681997371879107, "grad_norm": 0.2596457600593567, "learning_rate": 1.884928837663902e-06, "loss": 0.4331303596496582, "memory(GiB)": 86.38, "step": 1650, "token_acc": 0.8620848945234307, "train_speed(iter/s)": 0.033785 }, { "epoch": 2.1747700394218135, "grad_norm": 0.2749711871147156, "learning_rate": 1.8572592016407337e-06, "loss": 0.4339931488037109, "memory(GiB)": 86.38, "step": 1655, "token_acc": 0.8706686188384578, "train_speed(iter/s)": 0.033784 }, { "epoch": 2.181340341655716, "grad_norm": 0.26862356066703796, "learning_rate": 1.8297477600747854e-06, "loss": 0.43131422996520996, "memory(GiB)": 86.38, "step": 1660, "token_acc": 0.8703601718250908, "train_speed(iter/s)": 0.033783 }, { "epoch": 2.187910643889619, "grad_norm": 0.28293994069099426, "learning_rate": 1.8023958977947303e-06, "loss": 0.4327284812927246, "memory(GiB)": 86.38, "step": 1665, "token_acc": 0.8674884437596302, "train_speed(iter/s)": 0.033781 }, { "epoch": 2.1944809461235217, "grad_norm": 0.2755849063396454, "learning_rate": 1.7752049915965807e-06, "loss": 0.43210086822509763, "memory(GiB)": 86.38, "step": 1670, "token_acc": 0.8653022928516977, "train_speed(iter/s)": 0.033781 }, { "epoch": 2.2010512483574245, "grad_norm": 0.2687658965587616, "learning_rate": 1.7481764101743925e-06, "loss": 0.4309385776519775, "memory(GiB)": 86.38, "step": 1675, "token_acc": 0.8708192896033187, "train_speed(iter/s)": 0.033778 }, { "epoch": 2.2076215505913273, "grad_norm": 0.2643987536430359, "learning_rate": 1.7213115140513687e-06, "loss": 0.43217859268188474, "memory(GiB)": 86.38, "step": 1680, "token_acc": 0.8690569923081582, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.21419185282523, "grad_norm": 0.27602747082710266, "learning_rate": 1.694611655511365e-06, "loss": 0.42904300689697267, "memory(GiB)": 86.38, "step": 1685, "token_acc": 0.8896275737429807, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.2207621550591328, "grad_norm": 0.25782617926597595, "learning_rate": 1.668078178530837e-06, "loss": 0.4349325180053711, "memory(GiB)": 86.38, "step": 1690, "token_acc": 0.8658529694298469, "train_speed(iter/s)": 0.033775 }, { "epoch": 2.2273324572930355, "grad_norm": 0.26953521370887756, "learning_rate": 1.6417124187111778e-06, "loss": 0.4276991844177246, "memory(GiB)": 86.38, "step": 1695, "token_acc": 0.8727225739759659, "train_speed(iter/s)": 0.033775 }, { "epoch": 2.2339027595269383, "grad_norm": 0.2712646424770355, "learning_rate": 1.6155157032114926e-06, "loss": 0.4300542831420898, "memory(GiB)": 86.38, "step": 1700, "token_acc": 0.8694365753855838, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.240473061760841, "grad_norm": 0.28259536623954773, "learning_rate": 1.589489350681791e-06, "loss": 0.43476276397705077, "memory(GiB)": 86.38, "step": 1705, "token_acc": 0.8633074766964344, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.247043363994744, "grad_norm": 0.2692559063434601, "learning_rate": 1.5636346711966154e-06, "loss": 0.4304978847503662, "memory(GiB)": 86.38, "step": 1710, "token_acc": 0.8691604140423901, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.2536136662286466, "grad_norm": 0.26556524634361267, "learning_rate": 1.5379529661890956e-06, "loss": 0.4372213363647461, "memory(GiB)": 86.38, "step": 1715, "token_acc": 0.8606243830207305, "train_speed(iter/s)": 0.033778 }, { "epoch": 2.2601839684625493, "grad_norm": 0.26940152049064636, "learning_rate": 1.512445528385434e-06, "loss": 0.4369645118713379, "memory(GiB)": 86.38, "step": 1720, "token_acc": 0.857104328673529, "train_speed(iter/s)": 0.033779 }, { "epoch": 2.266754270696452, "grad_norm": 0.2632419466972351, "learning_rate": 1.4871136417398407e-06, "loss": 0.43130922317504883, "memory(GiB)": 86.38, "step": 1725, "token_acc": 0.8684261345349211, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.273324572930355, "grad_norm": 0.27120915055274963, "learning_rate": 1.4619585813699032e-06, "loss": 0.436324405670166, "memory(GiB)": 86.38, "step": 1730, "token_acc": 0.8729593158849442, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.2798948751642576, "grad_norm": 0.28977081179618835, "learning_rate": 1.436981613492394e-06, "loss": 0.434481143951416, "memory(GiB)": 86.38, "step": 1735, "token_acc": 0.863697705802969, "train_speed(iter/s)": 0.033771 }, { "epoch": 2.2864651773981604, "grad_norm": 0.27072688937187195, "learning_rate": 1.412183995359544e-06, "loss": 0.43726301193237305, "memory(GiB)": 86.38, "step": 1740, "token_acc": 0.8646803900325027, "train_speed(iter/s)": 0.033769 }, { "epoch": 2.293035479632063, "grad_norm": 0.2683422863483429, "learning_rate": 1.3875669751957548e-06, "loss": 0.4344059467315674, "memory(GiB)": 86.38, "step": 1745, "token_acc": 0.8645030938249779, "train_speed(iter/s)": 0.03377 }, { "epoch": 2.299605781865966, "grad_norm": 0.2548208236694336, "learning_rate": 1.3631317921347564e-06, "loss": 0.4341590881347656, "memory(GiB)": 86.38, "step": 1750, "token_acc": 0.8695078031212485, "train_speed(iter/s)": 0.033772 }, { "epoch": 2.3061760840998686, "grad_norm": 0.25699329376220703, "learning_rate": 1.3388796761572493e-06, "loss": 0.43475918769836425, "memory(GiB)": 86.38, "step": 1755, "token_acc": 0.8668202539091221, "train_speed(iter/s)": 0.033773 }, { "epoch": 2.3127463863337714, "grad_norm": 0.284801721572876, "learning_rate": 1.3148118480289834e-06, "loss": 0.43476195335388185, "memory(GiB)": 86.38, "step": 1760, "token_acc": 0.872836719337848, "train_speed(iter/s)": 0.033772 }, { "epoch": 2.319316688567674, "grad_norm": 0.2635682225227356, "learning_rate": 1.2909295192393057e-06, "loss": 0.4339436531066895, "memory(GiB)": 86.38, "step": 1765, "token_acc": 0.8698166676305592, "train_speed(iter/s)": 0.033774 }, { "epoch": 2.325886990801577, "grad_norm": 0.25871872901916504, "learning_rate": 1.2672338919401866e-06, "loss": 0.4373739719390869, "memory(GiB)": 86.38, "step": 1770, "token_acc": 0.8606108452163616, "train_speed(iter/s)": 0.033775 }, { "epoch": 2.3324572930354797, "grad_norm": 0.26007142663002014, "learning_rate": 1.2437261588857037e-06, "loss": 0.432224702835083, "memory(GiB)": 86.38, "step": 1775, "token_acc": 0.8673443326352352, "train_speed(iter/s)": 0.033776 }, { "epoch": 2.3390275952693824, "grad_norm": 0.26318100094795227, "learning_rate": 1.2204075033720025e-06, "loss": 0.4342185020446777, "memory(GiB)": 86.38, "step": 1780, "token_acc": 0.8722417109878918, "train_speed(iter/s)": 0.033777 }, { "epoch": 2.345597897503285, "grad_norm": 0.25941622257232666, "learning_rate": 1.197279099177731e-06, "loss": 0.43193416595458983, "memory(GiB)": 86.38, "step": 1785, "token_acc": 0.8598272926295305, "train_speed(iter/s)": 0.033777 }, { "epoch": 2.352168199737188, "grad_norm": 0.2658545970916748, "learning_rate": 1.1743421105049612e-06, "loss": 0.432745361328125, "memory(GiB)": 86.38, "step": 1790, "token_acc": 0.8685264027451229, "train_speed(iter/s)": 0.033774 }, { "epoch": 2.3587385019710907, "grad_norm": 0.2550273537635803, "learning_rate": 1.1515976919205869e-06, "loss": 0.43065509796142576, "memory(GiB)": 86.38, "step": 1795, "token_acc": 0.8694100591056094, "train_speed(iter/s)": 0.033774 }, { "epoch": 2.3653088042049935, "grad_norm": 0.27043265104293823, "learning_rate": 1.1290469882981987e-06, "loss": 0.4335516929626465, "memory(GiB)": 86.38, "step": 1800, "token_acc": 0.8756407695892418, "train_speed(iter/s)": 0.033772 }, { "epoch": 2.371879106438896, "grad_norm": 0.2714201509952545, "learning_rate": 1.1066911347604653e-06, "loss": 0.43355650901794435, "memory(GiB)": 86.38, "step": 1805, "token_acc": 0.8681369627127624, "train_speed(iter/s)": 0.033741 }, { "epoch": 2.378449408672799, "grad_norm": 0.2614336311817169, "learning_rate": 1.0845312566219924e-06, "loss": 0.4319025993347168, "memory(GiB)": 86.38, "step": 1810, "token_acc": 0.869137266528313, "train_speed(iter/s)": 0.033739 }, { "epoch": 2.3850197109067017, "grad_norm": 0.258635014295578, "learning_rate": 1.0625684693326727e-06, "loss": 0.4368411064147949, "memory(GiB)": 86.38, "step": 1815, "token_acc": 0.8626625620405856, "train_speed(iter/s)": 0.033739 }, { "epoch": 2.3915900131406045, "grad_norm": 0.2613593637943268, "learning_rate": 1.0408038784215462e-06, "loss": 0.43021059036254883, "memory(GiB)": 86.38, "step": 1820, "token_acc": 0.8690397350993377, "train_speed(iter/s)": 0.033738 }, { "epoch": 2.3981603153745072, "grad_norm": 0.2565341889858246, "learning_rate": 1.019238579441148e-06, "loss": 0.43543272018432616, "memory(GiB)": 86.38, "step": 1825, "token_acc": 0.8536611843890789, "train_speed(iter/s)": 0.033739 }, { "epoch": 2.40473061760841, "grad_norm": 0.27332931756973267, "learning_rate": 9.978736579123577e-07, "loss": 0.43721885681152345, "memory(GiB)": 86.38, "step": 1830, "token_acc": 0.8662766830870279, "train_speed(iter/s)": 0.03374 }, { "epoch": 2.4113009198423128, "grad_norm": 0.25557610392570496, "learning_rate": 9.7671018926977e-07, "loss": 0.4312717914581299, "memory(GiB)": 86.38, "step": 1835, "token_acc": 0.8687478440841669, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.4178712220762155, "grad_norm": 0.2686271071434021, "learning_rate": 9.5574923880755e-07, "loss": 0.43270196914672854, "memory(GiB)": 86.38, "step": 1840, "token_acc": 0.8766914011348756, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.4244415243101183, "grad_norm": 0.27892932295799255, "learning_rate": 9.349918616258113e-07, "loss": 0.43126745223999025, "memory(GiB)": 86.38, "step": 1845, "token_acc": 0.8783595334685599, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.431011826544021, "grad_norm": 0.258810430765152, "learning_rate": 9.144391025775123e-07, "loss": 0.4329942226409912, "memory(GiB)": 86.38, "step": 1850, "token_acc": 0.8575249047268837, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.437582128777924, "grad_norm": 0.2641558051109314, "learning_rate": 8.940919962158584e-07, "loss": 0.4300084114074707, "memory(GiB)": 86.38, "step": 1855, "token_acc": 0.8717879906071788, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.4441524310118266, "grad_norm": 0.27095404267311096, "learning_rate": 8.739515667422211e-07, "loss": 0.42922472953796387, "memory(GiB)": 86.38, "step": 1860, "token_acc": 0.8675078864353313, "train_speed(iter/s)": 0.033745 }, { "epoch": 2.4507227332457293, "grad_norm": 0.25571873784065247, "learning_rate": 8.540188279545942e-07, "loss": 0.4320818901062012, "memory(GiB)": 86.38, "step": 1865, "token_acc": 0.8654411764705883, "train_speed(iter/s)": 0.033748 }, { "epoch": 2.457293035479632, "grad_norm": 0.260748952627182, "learning_rate": 8.342947831965537e-07, "loss": 0.4332849979400635, "memory(GiB)": 86.38, "step": 1870, "token_acc": 0.8596434777012678, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.463863337713535, "grad_norm": 0.26162466406822205, "learning_rate": 8.147804253067581e-07, "loss": 0.4324943065643311, "memory(GiB)": 86.38, "step": 1875, "token_acc": 0.8610932130584192, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.4704336399474376, "grad_norm": 0.2608964443206787, "learning_rate": 7.954767365689675e-07, "loss": 0.43703885078430177, "memory(GiB)": 86.38, "step": 1880, "token_acc": 0.8689590565933764, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.4770039421813403, "grad_norm": 0.2531532943248749, "learning_rate": 7.763846886626048e-07, "loss": 0.4334650993347168, "memory(GiB)": 86.38, "step": 1885, "token_acc": 0.8626056024899955, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.483574244415243, "grad_norm": 0.25901561975479126, "learning_rate": 7.575052426138424e-07, "loss": 0.43249049186706545, "memory(GiB)": 86.38, "step": 1890, "token_acc": 0.8670503026093859, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.490144546649146, "grad_norm": 0.2607087194919586, "learning_rate": 7.388393487472223e-07, "loss": 0.4332951545715332, "memory(GiB)": 86.38, "step": 1895, "token_acc": 0.8609271523178808, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.4967148488830486, "grad_norm": 0.24934022128582, "learning_rate": 7.203879466378311e-07, "loss": 0.43254899978637695, "memory(GiB)": 86.38, "step": 1900, "token_acc": 0.8726016035388443, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.5032851511169514, "grad_norm": 0.274565726518631, "learning_rate": 7.021519650639952e-07, "loss": 0.42643136978149415, "memory(GiB)": 86.38, "step": 1905, "token_acc": 0.8752810932364643, "train_speed(iter/s)": 0.033745 }, { "epoch": 2.509855453350854, "grad_norm": 0.2578328847885132, "learning_rate": 6.841323219605333e-07, "loss": 0.43291406631469725, "memory(GiB)": 86.38, "step": 1910, "token_acc": 0.8636747967479674, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.516425755584757, "grad_norm": 0.248977929353714, "learning_rate": 6.663299243725512e-07, "loss": 0.42647299766540525, "memory(GiB)": 86.38, "step": 1915, "token_acc": 0.8544173576906291, "train_speed(iter/s)": 0.033743 }, { "epoch": 2.5229960578186597, "grad_norm": 0.2570980191230774, "learning_rate": 6.487456684097848e-07, "loss": 0.43337106704711914, "memory(GiB)": 86.38, "step": 1920, "token_acc": 0.8634151992585728, "train_speed(iter/s)": 0.033741 }, { "epoch": 2.5295663600525624, "grad_norm": 0.2565690875053406, "learning_rate": 6.313804392014905e-07, "loss": 0.4316126823425293, "memory(GiB)": 86.38, "step": 1925, "token_acc": 0.8810017459624618, "train_speed(iter/s)": 0.033741 }, { "epoch": 2.536136662286465, "grad_norm": 0.2750677168369293, "learning_rate": 6.142351108518929e-07, "loss": 0.4336524963378906, "memory(GiB)": 86.38, "step": 1930, "token_acc": 0.8709823449524672, "train_speed(iter/s)": 0.03374 }, { "epoch": 2.542706964520368, "grad_norm": 0.2661495506763458, "learning_rate": 5.973105463961864e-07, "loss": 0.43224172592163085, "memory(GiB)": 86.38, "step": 1935, "token_acc": 0.8723215768783567, "train_speed(iter/s)": 0.033739 }, { "epoch": 2.5492772667542707, "grad_norm": 0.2650693655014038, "learning_rate": 5.806075977570886e-07, "loss": 0.43565120697021487, "memory(GiB)": 86.38, "step": 1940, "token_acc": 0.868112798264642, "train_speed(iter/s)": 0.033737 }, { "epoch": 2.5558475689881734, "grad_norm": 0.26757803559303284, "learning_rate": 5.641271057019637e-07, "loss": 0.4298720359802246, "memory(GiB)": 86.38, "step": 1945, "token_acc": 0.8649701539428213, "train_speed(iter/s)": 0.03374 }, { "epoch": 2.562417871222076, "grad_norm": 0.2645432949066162, "learning_rate": 5.478698998004967e-07, "loss": 0.4320925235748291, "memory(GiB)": 86.38, "step": 1950, "token_acc": 0.8747405689171042, "train_speed(iter/s)": 0.033741 }, { "epoch": 2.568988173455979, "grad_norm": 0.2562493085861206, "learning_rate": 5.318367983829393e-07, "loss": 0.43427433967590334, "memory(GiB)": 86.38, "step": 1955, "token_acc": 0.8623294224281183, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.5755584756898817, "grad_norm": 0.2527105212211609, "learning_rate": 5.160286084989119e-07, "loss": 0.4341059684753418, "memory(GiB)": 86.38, "step": 1960, "token_acc": 0.8693252448908557, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.5821287779237845, "grad_norm": 0.2574499249458313, "learning_rate": 5.004461258767873e-07, "loss": 0.43187813758850097, "memory(GiB)": 86.38, "step": 1965, "token_acc": 0.866506053867062, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.5886990801576872, "grad_norm": 0.2509396970272064, "learning_rate": 4.850901348836328e-07, "loss": 0.4363058090209961, "memory(GiB)": 86.38, "step": 1970, "token_acc": 0.8651067174557108, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.59526938239159, "grad_norm": 0.25620976090431213, "learning_rate": 4.699614084857257e-07, "loss": 0.43309574127197265, "memory(GiB)": 86.38, "step": 1975, "token_acc": 0.8586676260718354, "train_speed(iter/s)": 0.033748 }, { "epoch": 2.6018396846254928, "grad_norm": 0.26240846514701843, "learning_rate": 4.5506070820964973e-07, "loss": 0.4343746185302734, "memory(GiB)": 86.38, "step": 1980, "token_acc": 0.8798804986092511, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.6084099868593955, "grad_norm": 0.2610469460487366, "learning_rate": 4.4038878410396003e-07, "loss": 0.43410425186157225, "memory(GiB)": 86.38, "step": 1985, "token_acc": 0.8795611253711813, "train_speed(iter/s)": 0.033747 }, { "epoch": 2.6149802890932983, "grad_norm": 0.26572689414024353, "learning_rate": 4.2594637470142587e-07, "loss": 0.4306765556335449, "memory(GiB)": 86.38, "step": 1990, "token_acc": 0.8712702886577899, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.621550591327201, "grad_norm": 0.2641327381134033, "learning_rate": 4.1173420698186027e-07, "loss": 0.4300968647003174, "memory(GiB)": 86.38, "step": 1995, "token_acc": 0.8624918094168305, "train_speed(iter/s)": 0.033745 }, { "epoch": 2.628120893561104, "grad_norm": 0.2534749507904053, "learning_rate": 3.9775299633552535e-07, "loss": 0.43284106254577637, "memory(GiB)": 86.38, "step": 2000, "token_acc": 0.8643312431984246, "train_speed(iter/s)": 0.033744 }, { "epoch": 2.6346911957950065, "grad_norm": 0.2566858232021332, "learning_rate": 3.840034465271164e-07, "loss": 0.4347895622253418, "memory(GiB)": 86.38, "step": 2005, "token_acc": 0.8744274109814939, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.6412614980289093, "grad_norm": 0.2541216313838959, "learning_rate": 3.7048624966034506e-07, "loss": 0.4313460350036621, "memory(GiB)": 86.38, "step": 2010, "token_acc": 0.8607475533545572, "train_speed(iter/s)": 0.033742 }, { "epoch": 2.647831800262812, "grad_norm": 0.2455441802740097, "learning_rate": 3.572020861430997e-07, "loss": 0.429301118850708, "memory(GiB)": 86.38, "step": 2015, "token_acc": 0.8699587080717235, "train_speed(iter/s)": 0.033745 }, { "epoch": 2.654402102496715, "grad_norm": 0.2605392336845398, "learning_rate": 3.4415162465318843e-07, "loss": 0.43214893341064453, "memory(GiB)": 86.38, "step": 2020, "token_acc": 0.8749286122215877, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.6609724047306176, "grad_norm": 0.25466424226760864, "learning_rate": 3.313355221046888e-07, "loss": 0.4351536273956299, "memory(GiB)": 86.38, "step": 2025, "token_acc": 0.8606416722999324, "train_speed(iter/s)": 0.033747 }, { "epoch": 2.6675427069645203, "grad_norm": 0.26080095767974854, "learning_rate": 3.1875442361487987e-07, "loss": 0.43200006484985354, "memory(GiB)": 86.38, "step": 2030, "token_acc": 0.8577261487147047, "train_speed(iter/s)": 0.033746 }, { "epoch": 2.674113009198423, "grad_norm": 0.2586158215999603, "learning_rate": 3.0640896247176257e-07, "loss": 0.4336066246032715, "memory(GiB)": 86.38, "step": 2035, "token_acc": 0.8641052229438411, "train_speed(iter/s)": 0.033748 }, { "epoch": 2.680683311432326, "grad_norm": 0.25608712434768677, "learning_rate": 2.942997601021924e-07, "loss": 0.431638240814209, "memory(GiB)": 86.38, "step": 2040, "token_acc": 0.8686192034065534, "train_speed(iter/s)": 0.033749 }, { "epoch": 2.6872536136662286, "grad_norm": 0.2741917669773102, "learning_rate": 2.824274260405896e-07, "loss": 0.43211984634399414, "memory(GiB)": 86.38, "step": 2045, "token_acc": 0.8671380975045897, "train_speed(iter/s)": 0.03375 }, { "epoch": 2.6938239159001314, "grad_norm": 0.25155529379844666, "learning_rate": 2.7079255789826565e-07, "loss": 0.4306828022003174, "memory(GiB)": 86.38, "step": 2050, "token_acc": 0.8573598004121922, "train_speed(iter/s)": 0.03375 }, { "epoch": 2.700394218134034, "grad_norm": 0.2564401924610138, "learning_rate": 2.593957413333331e-07, "loss": 0.435395622253418, "memory(GiB)": 86.38, "step": 2055, "token_acc": 0.8754750443374715, "train_speed(iter/s)": 0.033751 }, { "epoch": 2.706964520367937, "grad_norm": 0.26199498772621155, "learning_rate": 2.4823755002123253e-07, "loss": 0.43353948593139646, "memory(GiB)": 86.38, "step": 2060, "token_acc": 0.864589503613316, "train_speed(iter/s)": 0.033752 }, { "epoch": 2.7135348226018396, "grad_norm": 0.25014081597328186, "learning_rate": 2.373185456258531e-07, "loss": 0.43132529258728025, "memory(GiB)": 86.38, "step": 2065, "token_acc": 0.8730107001249253, "train_speed(iter/s)": 0.033753 }, { "epoch": 2.7201051248357424, "grad_norm": 0.26392775774002075, "learning_rate": 2.266392777712595e-07, "loss": 0.4323751926422119, "memory(GiB)": 86.38, "step": 2070, "token_acc": 0.8591085068536152, "train_speed(iter/s)": 0.033755 }, { "epoch": 2.726675427069645, "grad_norm": 0.2512003779411316, "learning_rate": 2.1620028401402815e-07, "loss": 0.42936067581176757, "memory(GiB)": 86.38, "step": 2075, "token_acc": 0.8530398736675878, "train_speed(iter/s)": 0.033755 }, { "epoch": 2.733245729303548, "grad_norm": 0.2585814893245697, "learning_rate": 2.060020898161863e-07, "loss": 0.4324427604675293, "memory(GiB)": 86.38, "step": 2080, "token_acc": 0.8682577296321372, "train_speed(iter/s)": 0.033755 }, { "epoch": 2.7398160315374507, "grad_norm": 0.2547103464603424, "learning_rate": 1.9604520851876196e-07, "loss": 0.42908296585083006, "memory(GiB)": 86.38, "step": 2085, "token_acc": 0.876189898744922, "train_speed(iter/s)": 0.033753 }, { "epoch": 2.7463863337713534, "grad_norm": 0.25863251090049744, "learning_rate": 1.863301413159474e-07, "loss": 0.43100652694702146, "memory(GiB)": 86.38, "step": 2090, "token_acc": 0.8747226144845672, "train_speed(iter/s)": 0.033756 }, { "epoch": 2.752956636005256, "grad_norm": 0.2541860044002533, "learning_rate": 1.768573772298665e-07, "loss": 0.43143587112426757, "memory(GiB)": 86.38, "step": 2095, "token_acc": 0.8809481163054511, "train_speed(iter/s)": 0.033757 }, { "epoch": 2.759526938239159, "grad_norm": 0.24803873896598816, "learning_rate": 1.6762739308596343e-07, "loss": 0.4299370765686035, "memory(GiB)": 86.38, "step": 2100, "token_acc": 0.8700363353231249, "train_speed(iter/s)": 0.033756 } ], "logging_steps": 5, "max_steps": 2283, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4827819580022116e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }