diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4382 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0064516129032258064, + "grad_norm": 2.3284332752227783, + "learning_rate": 4.032258064516129e-07, + "loss": 0.1831, + "step": 1 + }, + { + "epoch": 0.012903225806451613, + "grad_norm": 3.1032278537750244, + "learning_rate": 8.064516129032258e-07, + "loss": 0.2496, + "step": 2 + }, + { + "epoch": 0.01935483870967742, + "grad_norm": 2.7308666706085205, + "learning_rate": 1.2096774193548388e-06, + "loss": 0.2497, + "step": 3 + }, + { + "epoch": 0.025806451612903226, + "grad_norm": 2.6942598819732666, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.2513, + "step": 4 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.281903028488159, + "learning_rate": 2.0161290322580646e-06, + "loss": 0.2021, + "step": 5 + }, + { + "epoch": 0.03870967741935484, + "grad_norm": 2.2116780281066895, + "learning_rate": 2.4193548387096776e-06, + "loss": 0.2472, + "step": 6 + }, + { + "epoch": 0.04516129032258064, + "grad_norm": 2.3709909915924072, + "learning_rate": 2.82258064516129e-06, + "loss": 0.2133, + "step": 7 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 1.6222234964370728, + "learning_rate": 3.225806451612903e-06, + "loss": 0.2072, + "step": 8 + }, + { + "epoch": 0.05806451612903226, + "grad_norm": 1.6226286888122559, + "learning_rate": 3.6290322580645166e-06, + "loss": 0.1732, + "step": 9 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.514697551727295, + "learning_rate": 4.032258064516129e-06, + "loss": 0.2088, + "step": 10 + }, + { + "epoch": 0.07096774193548387, + "grad_norm": 1.6407102346420288, + "learning_rate": 4.435483870967742e-06, + "loss": 0.1733, + "step": 11 + }, + { + "epoch": 0.07741935483870968, + "grad_norm": 1.5498034954071045, + "learning_rate": 4.838709677419355e-06, + "loss": 0.2087, + "step": 12 + }, + { + "epoch": 0.08387096774193549, + "grad_norm": 1.9465513229370117, + "learning_rate": 5.241935483870968e-06, + "loss": 0.2046, + "step": 13 + }, + { + "epoch": 0.09032258064516129, + "grad_norm": 1.3483728170394897, + "learning_rate": 5.64516129032258e-06, + "loss": 0.1783, + "step": 14 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.4068559408187866, + "learning_rate": 6.048387096774194e-06, + "loss": 0.1519, + "step": 15 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 1.3083986043930054, + "learning_rate": 6.451612903225806e-06, + "loss": 0.1586, + "step": 16 + }, + { + "epoch": 0.10967741935483871, + "grad_norm": 1.6559300422668457, + "learning_rate": 6.854838709677419e-06, + "loss": 0.1476, + "step": 17 + }, + { + "epoch": 0.11612903225806452, + "grad_norm": 1.0691255331039429, + "learning_rate": 7.258064516129033e-06, + "loss": 0.1478, + "step": 18 + }, + { + "epoch": 0.12258064516129032, + "grad_norm": 1.1155110597610474, + "learning_rate": 7.661290322580646e-06, + "loss": 0.1382, + "step": 19 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.9816218018531799, + "learning_rate": 8.064516129032258e-06, + "loss": 0.1204, + "step": 20 + }, + { + "epoch": 0.13548387096774195, + "grad_norm": 1.2463096380233765, + "learning_rate": 8.46774193548387e-06, + "loss": 0.1503, + "step": 21 + }, + { + "epoch": 0.14193548387096774, + "grad_norm": 1.3447906970977783, + "learning_rate": 8.870967741935484e-06, + "loss": 0.1224, + "step": 22 + }, + { + "epoch": 0.14838709677419354, + "grad_norm": 1.1465381383895874, + "learning_rate": 9.274193548387097e-06, + "loss": 0.1374, + "step": 23 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 1.0978549718856812, + "learning_rate": 9.67741935483871e-06, + "loss": 0.1303, + "step": 24 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.1053048372268677, + "learning_rate": 1.0080645161290323e-05, + "loss": 0.1423, + "step": 25 + }, + { + "epoch": 0.16774193548387098, + "grad_norm": 1.0212026834487915, + "learning_rate": 1.0483870967741936e-05, + "loss": 0.1281, + "step": 26 + }, + { + "epoch": 0.17419354838709677, + "grad_norm": 0.9742250442504883, + "learning_rate": 1.0887096774193549e-05, + "loss": 0.1228, + "step": 27 + }, + { + "epoch": 0.18064516129032257, + "grad_norm": 1.1676782369613647, + "learning_rate": 1.129032258064516e-05, + "loss": 0.1429, + "step": 28 + }, + { + "epoch": 0.1870967741935484, + "grad_norm": 1.088600516319275, + "learning_rate": 1.1693548387096775e-05, + "loss": 0.1069, + "step": 29 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.3947196006774902, + "learning_rate": 1.2096774193548388e-05, + "loss": 0.1316, + "step": 30 + }, + { + "epoch": 0.2, + "grad_norm": 0.9059141874313354, + "learning_rate": 1.25e-05, + "loss": 0.1121, + "step": 31 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 1.0918734073638916, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.1521, + "step": 32 + }, + { + "epoch": 0.2129032258064516, + "grad_norm": 1.1550475358963013, + "learning_rate": 1.3306451612903225e-05, + "loss": 0.1251, + "step": 33 + }, + { + "epoch": 0.21935483870967742, + "grad_norm": 1.2110551595687866, + "learning_rate": 1.3709677419354839e-05, + "loss": 0.1325, + "step": 34 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 1.0340098142623901, + "learning_rate": 1.4112903225806454e-05, + "loss": 0.1115, + "step": 35 + }, + { + "epoch": 0.23225806451612904, + "grad_norm": 0.9789180159568787, + "learning_rate": 1.4516129032258066e-05, + "loss": 0.1154, + "step": 36 + }, + { + "epoch": 0.23870967741935484, + "grad_norm": 0.8362810015678406, + "learning_rate": 1.4919354838709679e-05, + "loss": 0.1041, + "step": 37 + }, + { + "epoch": 0.24516129032258063, + "grad_norm": 1.0805575847625732, + "learning_rate": 1.5322580645161292e-05, + "loss": 0.1201, + "step": 38 + }, + { + "epoch": 0.25161290322580643, + "grad_norm": 1.0794912576675415, + "learning_rate": 1.5725806451612903e-05, + "loss": 0.1387, + "step": 39 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.0303066968917847, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.1381, + "step": 40 + }, + { + "epoch": 0.2645161290322581, + "grad_norm": 0.8959848284721375, + "learning_rate": 1.653225806451613e-05, + "loss": 0.1251, + "step": 41 + }, + { + "epoch": 0.2709677419354839, + "grad_norm": 1.0856695175170898, + "learning_rate": 1.693548387096774e-05, + "loss": 0.1363, + "step": 42 + }, + { + "epoch": 0.27741935483870966, + "grad_norm": 0.8375802636146545, + "learning_rate": 1.733870967741936e-05, + "loss": 0.1009, + "step": 43 + }, + { + "epoch": 0.2838709677419355, + "grad_norm": 0.9029824733734131, + "learning_rate": 1.774193548387097e-05, + "loss": 0.1006, + "step": 44 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.8736345767974854, + "learning_rate": 1.8145161290322583e-05, + "loss": 0.1212, + "step": 45 + }, + { + "epoch": 0.2967741935483871, + "grad_norm": 1.165887713432312, + "learning_rate": 1.8548387096774193e-05, + "loss": 0.1241, + "step": 46 + }, + { + "epoch": 0.3032258064516129, + "grad_norm": 0.8511247634887695, + "learning_rate": 1.8951612903225807e-05, + "loss": 0.0942, + "step": 47 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 1.0182602405548096, + "learning_rate": 1.935483870967742e-05, + "loss": 0.116, + "step": 48 + }, + { + "epoch": 0.3161290322580645, + "grad_norm": 0.8452662825584412, + "learning_rate": 1.975806451612903e-05, + "loss": 0.1023, + "step": 49 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.9583229422569275, + "learning_rate": 2.0161290322580645e-05, + "loss": 0.0956, + "step": 50 + }, + { + "epoch": 0.32903225806451614, + "grad_norm": 0.935484766960144, + "learning_rate": 2.056451612903226e-05, + "loss": 0.1185, + "step": 51 + }, + { + "epoch": 0.33548387096774196, + "grad_norm": 0.9844627380371094, + "learning_rate": 2.0967741935483873e-05, + "loss": 0.0998, + "step": 52 + }, + { + "epoch": 0.3419354838709677, + "grad_norm": 1.0139315128326416, + "learning_rate": 2.1370967741935487e-05, + "loss": 0.0901, + "step": 53 + }, + { + "epoch": 0.34838709677419355, + "grad_norm": 0.844688892364502, + "learning_rate": 2.1774193548387097e-05, + "loss": 0.1158, + "step": 54 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 0.778408408164978, + "learning_rate": 2.217741935483871e-05, + "loss": 0.0884, + "step": 55 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 0.7307286858558655, + "learning_rate": 2.258064516129032e-05, + "loss": 0.1099, + "step": 56 + }, + { + "epoch": 0.36774193548387096, + "grad_norm": 0.681089460849762, + "learning_rate": 2.2983870967741935e-05, + "loss": 0.0965, + "step": 57 + }, + { + "epoch": 0.3741935483870968, + "grad_norm": 0.7206712365150452, + "learning_rate": 2.338709677419355e-05, + "loss": 0.0898, + "step": 58 + }, + { + "epoch": 0.38064516129032255, + "grad_norm": 0.6326794624328613, + "learning_rate": 2.3790322580645163e-05, + "loss": 0.0896, + "step": 59 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.6684013605117798, + "learning_rate": 2.4193548387096777e-05, + "loss": 0.0859, + "step": 60 + }, + { + "epoch": 0.3935483870967742, + "grad_norm": 0.7839128971099854, + "learning_rate": 2.4596774193548387e-05, + "loss": 0.0918, + "step": 61 + }, + { + "epoch": 0.4, + "grad_norm": 0.7025837302207947, + "learning_rate": 2.5e-05, + "loss": 0.0933, + "step": 62 + }, + { + "epoch": 0.4064516129032258, + "grad_norm": 0.7583072185516357, + "learning_rate": 2.4999801888257584e-05, + "loss": 0.0916, + "step": 63 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 0.8116795420646667, + "learning_rate": 2.499920755931005e-05, + "loss": 0.0934, + "step": 64 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 0.9053534865379333, + "learning_rate": 2.4998217031996375e-05, + "loss": 0.1116, + "step": 65 + }, + { + "epoch": 0.4258064516129032, + "grad_norm": 0.773985743522644, + "learning_rate": 2.4996830337714163e-05, + "loss": 0.0874, + "step": 66 + }, + { + "epoch": 0.432258064516129, + "grad_norm": 0.8468173146247864, + "learning_rate": 2.4995047520418692e-05, + "loss": 0.0954, + "step": 67 + }, + { + "epoch": 0.43870967741935485, + "grad_norm": 0.7126619815826416, + "learning_rate": 2.4992868636621474e-05, + "loss": 0.1017, + "step": 68 + }, + { + "epoch": 0.44516129032258067, + "grad_norm": 0.7975043654441833, + "learning_rate": 2.4990293755388524e-05, + "loss": 0.1086, + "step": 69 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.8055579662322998, + "learning_rate": 2.4987322958338095e-05, + "loss": 0.0836, + "step": 70 + }, + { + "epoch": 0.45806451612903226, + "grad_norm": 0.6494209170341492, + "learning_rate": 2.4983956339638158e-05, + "loss": 0.0883, + "step": 71 + }, + { + "epoch": 0.4645161290322581, + "grad_norm": 0.6997829675674438, + "learning_rate": 2.4980194006003392e-05, + "loss": 0.0763, + "step": 72 + }, + { + "epoch": 0.47096774193548385, + "grad_norm": 0.596174418926239, + "learning_rate": 2.4976036076691787e-05, + "loss": 0.0871, + "step": 73 + }, + { + "epoch": 0.4774193548387097, + "grad_norm": 0.6535652279853821, + "learning_rate": 2.4971482683500884e-05, + "loss": 0.0869, + "step": 74 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.8003737926483154, + "learning_rate": 2.4966533970763586e-05, + "loss": 0.1086, + "step": 75 + }, + { + "epoch": 0.49032258064516127, + "grad_norm": 0.6992926001548767, + "learning_rate": 2.496119009534359e-05, + "loss": 0.0822, + "step": 76 + }, + { + "epoch": 0.4967741935483871, + "grad_norm": 0.6500689387321472, + "learning_rate": 2.4955451226630412e-05, + "loss": 0.0876, + "step": 77 + }, + { + "epoch": 0.5032258064516129, + "grad_norm": 0.7626132369041443, + "learning_rate": 2.4949317546534018e-05, + "loss": 0.0911, + "step": 78 + }, + { + "epoch": 0.5096774193548387, + "grad_norm": 0.6485949158668518, + "learning_rate": 2.4942789249479054e-05, + "loss": 0.0914, + "step": 79 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.692364513874054, + "learning_rate": 2.493586654239869e-05, + "loss": 0.1074, + "step": 80 + }, + { + "epoch": 0.5225806451612903, + "grad_norm": 0.7383131980895996, + "learning_rate": 2.4928549644728057e-05, + "loss": 0.085, + "step": 81 + }, + { + "epoch": 0.5290322580645161, + "grad_norm": 0.6585950255393982, + "learning_rate": 2.492083878839729e-05, + "loss": 0.0795, + "step": 82 + }, + { + "epoch": 0.535483870967742, + "grad_norm": 0.7683681845664978, + "learning_rate": 2.491273421782417e-05, + "loss": 0.073, + "step": 83 + }, + { + "epoch": 0.5419354838709678, + "grad_norm": 0.5386450290679932, + "learning_rate": 2.4904236189906406e-05, + "loss": 0.0814, + "step": 84 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.725712239742279, + "learning_rate": 2.489534497401345e-05, + "loss": 0.0896, + "step": 85 + }, + { + "epoch": 0.5548387096774193, + "grad_norm": 0.8596577644348145, + "learning_rate": 2.488606085197799e-05, + "loss": 0.0816, + "step": 86 + }, + { + "epoch": 0.5612903225806452, + "grad_norm": 0.7653164863586426, + "learning_rate": 2.4876384118086992e-05, + "loss": 0.1078, + "step": 87 + }, + { + "epoch": 0.567741935483871, + "grad_norm": 0.713628351688385, + "learning_rate": 2.48663150790724e-05, + "loss": 0.0887, + "step": 88 + }, + { + "epoch": 0.5741935483870968, + "grad_norm": 0.5724640488624573, + "learning_rate": 2.4855854054101395e-05, + "loss": 0.0849, + "step": 89 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.6235289573669434, + "learning_rate": 2.484500137476627e-05, + "loss": 0.0875, + "step": 90 + }, + { + "epoch": 0.5870967741935483, + "grad_norm": 0.785372793674469, + "learning_rate": 2.483375738507395e-05, + "loss": 0.1225, + "step": 91 + }, + { + "epoch": 0.5935483870967742, + "grad_norm": 0.6431748867034912, + "learning_rate": 2.4822122441435047e-05, + "loss": 0.0913, + "step": 92 + }, + { + "epoch": 0.6, + "grad_norm": 0.8031719923019409, + "learning_rate": 2.4810096912652604e-05, + "loss": 0.102, + "step": 93 + }, + { + "epoch": 0.6064516129032258, + "grad_norm": 0.5750744938850403, + "learning_rate": 2.4797681179910363e-05, + "loss": 0.0754, + "step": 94 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.7892565727233887, + "learning_rate": 2.4784875636760727e-05, + "loss": 0.0825, + "step": 95 + }, + { + "epoch": 0.6193548387096774, + "grad_norm": 0.7932739853858948, + "learning_rate": 2.4771680689112244e-05, + "loss": 0.1262, + "step": 96 + }, + { + "epoch": 0.6258064516129033, + "grad_norm": 0.7647889852523804, + "learning_rate": 2.4758096755216763e-05, + "loss": 0.1083, + "step": 97 + }, + { + "epoch": 0.632258064516129, + "grad_norm": 0.9550963640213013, + "learning_rate": 2.474412426565618e-05, + "loss": 0.0828, + "step": 98 + }, + { + "epoch": 0.6387096774193548, + "grad_norm": 0.6981013417243958, + "learning_rate": 2.4729763663328774e-05, + "loss": 0.0943, + "step": 99 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.8088532090187073, + "learning_rate": 2.4715015403435176e-05, + "loss": 0.0954, + "step": 100 + }, + { + "epoch": 0.6516129032258065, + "grad_norm": 0.9130911231040955, + "learning_rate": 2.4699879953463945e-05, + "loss": 0.0973, + "step": 101 + }, + { + "epoch": 0.6580645161290323, + "grad_norm": 0.791867196559906, + "learning_rate": 2.468435779317673e-05, + "loss": 0.0946, + "step": 102 + }, + { + "epoch": 0.6645161290322581, + "grad_norm": 0.6049063205718994, + "learning_rate": 2.466844941459309e-05, + "loss": 0.0797, + "step": 103 + }, + { + "epoch": 0.6709677419354839, + "grad_norm": 0.6488558053970337, + "learning_rate": 2.4652155321974883e-05, + "loss": 0.1004, + "step": 104 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.7218672633171082, + "learning_rate": 2.4635476031810284e-05, + "loss": 0.0943, + "step": 105 + }, + { + "epoch": 0.6838709677419355, + "grad_norm": 0.7997153997421265, + "learning_rate": 2.4618412072797407e-05, + "loss": 0.0831, + "step": 106 + }, + { + "epoch": 0.6903225806451613, + "grad_norm": 0.8165119886398315, + "learning_rate": 2.4600963985827555e-05, + "loss": 0.0919, + "step": 107 + }, + { + "epoch": 0.6967741935483871, + "grad_norm": 0.704238772392273, + "learning_rate": 2.458313232396808e-05, + "loss": 0.0778, + "step": 108 + }, + { + "epoch": 0.7032258064516129, + "grad_norm": 0.6857476234436035, + "learning_rate": 2.456491765244483e-05, + "loss": 0.0914, + "step": 109 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.7254015803337097, + "learning_rate": 2.4546320548624264e-05, + "loss": 0.1102, + "step": 110 + }, + { + "epoch": 0.7161290322580646, + "grad_norm": 0.6534197330474854, + "learning_rate": 2.4527341601995115e-05, + "loss": 0.0841, + "step": 111 + }, + { + "epoch": 0.7225806451612903, + "grad_norm": 0.6944810152053833, + "learning_rate": 2.450798141414974e-05, + "loss": 0.1067, + "step": 112 + }, + { + "epoch": 0.7290322580645161, + "grad_norm": 0.7583324909210205, + "learning_rate": 2.448824059876503e-05, + "loss": 0.0979, + "step": 113 + }, + { + "epoch": 0.7354838709677419, + "grad_norm": 0.5010597705841064, + "learning_rate": 2.4468119781582948e-05, + "loss": 0.069, + "step": 114 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.5694583058357239, + "learning_rate": 2.444761960039072e-05, + "loss": 0.0687, + "step": 115 + }, + { + "epoch": 0.7483870967741936, + "grad_norm": 0.803371787071228, + "learning_rate": 2.442674070500061e-05, + "loss": 0.123, + "step": 116 + }, + { + "epoch": 0.7548387096774194, + "grad_norm": 0.6523027420043945, + "learning_rate": 2.4405483757229314e-05, + "loss": 0.0917, + "step": 117 + }, + { + "epoch": 0.7612903225806451, + "grad_norm": 0.6718930006027222, + "learning_rate": 2.438384943087698e-05, + "loss": 0.0854, + "step": 118 + }, + { + "epoch": 0.7677419354838709, + "grad_norm": 0.5987946391105652, + "learning_rate": 2.4361838411705865e-05, + "loss": 0.0941, + "step": 119 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.6336897015571594, + "learning_rate": 2.4339451397418584e-05, + "loss": 0.0885, + "step": 120 + }, + { + "epoch": 0.7806451612903226, + "grad_norm": 0.7484766840934753, + "learning_rate": 2.4316689097636008e-05, + "loss": 0.0966, + "step": 121 + }, + { + "epoch": 0.7870967741935484, + "grad_norm": 0.7096850275993347, + "learning_rate": 2.4293552233874754e-05, + "loss": 0.0843, + "step": 122 + }, + { + "epoch": 0.7935483870967742, + "grad_norm": 0.6953093409538269, + "learning_rate": 2.4270041539524322e-05, + "loss": 0.079, + "step": 123 + }, + { + "epoch": 0.8, + "grad_norm": 0.6068540215492249, + "learning_rate": 2.4246157759823855e-05, + "loss": 0.0846, + "step": 124 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.5982446670532227, + "learning_rate": 2.4221901651838506e-05, + "loss": 0.0864, + "step": 125 + }, + { + "epoch": 0.8129032258064516, + "grad_norm": 0.6706437468528748, + "learning_rate": 2.419727398443545e-05, + "loss": 0.0796, + "step": 126 + }, + { + "epoch": 0.8193548387096774, + "grad_norm": 0.6994534730911255, + "learning_rate": 2.417227553825949e-05, + "loss": 0.0775, + "step": 127 + }, + { + "epoch": 0.8258064516129032, + "grad_norm": 0.6935513615608215, + "learning_rate": 2.4146907105708357e-05, + "loss": 0.1003, + "step": 128 + }, + { + "epoch": 0.832258064516129, + "grad_norm": 0.6945312023162842, + "learning_rate": 2.4121169490907544e-05, + "loss": 0.0901, + "step": 129 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.6928992867469788, + "learning_rate": 2.409506350968485e-05, + "loss": 0.0991, + "step": 130 + }, + { + "epoch": 0.8451612903225807, + "grad_norm": 0.6358478665351868, + "learning_rate": 2.4068589989544498e-05, + "loss": 0.0877, + "step": 131 + }, + { + "epoch": 0.8516129032258064, + "grad_norm": 0.6835708022117615, + "learning_rate": 2.404174976964092e-05, + "loss": 0.1058, + "step": 132 + }, + { + "epoch": 0.8580645161290322, + "grad_norm": 0.6372717022895813, + "learning_rate": 2.4014543700752156e-05, + "loss": 0.0899, + "step": 133 + }, + { + "epoch": 0.864516129032258, + "grad_norm": 0.671310544013977, + "learning_rate": 2.3986972645252883e-05, + "loss": 0.0744, + "step": 134 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.5800638794898987, + "learning_rate": 2.395903747708707e-05, + "loss": 0.0818, + "step": 135 + }, + { + "epoch": 0.8774193548387097, + "grad_norm": 0.5142645835876465, + "learning_rate": 2.39307390817403e-05, + "loss": 0.0811, + "step": 136 + }, + { + "epoch": 0.8838709677419355, + "grad_norm": 0.7107434868812561, + "learning_rate": 2.390207835621167e-05, + "loss": 0.0876, + "step": 137 + }, + { + "epoch": 0.8903225806451613, + "grad_norm": 0.6197046637535095, + "learning_rate": 2.3873056208985383e-05, + "loss": 0.0907, + "step": 138 + }, + { + "epoch": 0.896774193548387, + "grad_norm": 0.8946641087532043, + "learning_rate": 2.384367356000195e-05, + "loss": 0.0867, + "step": 139 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.6002138257026672, + "learning_rate": 2.3813931340629018e-05, + "loss": 0.0766, + "step": 140 + }, + { + "epoch": 0.9096774193548387, + "grad_norm": 0.4771173298358917, + "learning_rate": 2.378383049363184e-05, + "loss": 0.074, + "step": 141 + }, + { + "epoch": 0.9161290322580645, + "grad_norm": 0.6188220381736755, + "learning_rate": 2.3753371973143433e-05, + "loss": 0.0823, + "step": 142 + }, + { + "epoch": 0.9225806451612903, + "grad_norm": 0.509564995765686, + "learning_rate": 2.3722556744634272e-05, + "loss": 0.069, + "step": 143 + }, + { + "epoch": 0.9290322580645162, + "grad_norm": 0.5153804421424866, + "learning_rate": 2.3691385784881743e-05, + "loss": 0.064, + "step": 144 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 0.5935696363449097, + "learning_rate": 2.3659860081939146e-05, + "loss": 0.0827, + "step": 145 + }, + { + "epoch": 0.9419354838709677, + "grad_norm": 0.4910190999507904, + "learning_rate": 2.3627980635104396e-05, + "loss": 0.0804, + "step": 146 + }, + { + "epoch": 0.9483870967741935, + "grad_norm": 0.6524127721786499, + "learning_rate": 2.359574845488833e-05, + "loss": 0.0956, + "step": 147 + }, + { + "epoch": 0.9548387096774194, + "grad_norm": 0.6664571762084961, + "learning_rate": 2.356316456298269e-05, + "loss": 0.0937, + "step": 148 + }, + { + "epoch": 0.9612903225806452, + "grad_norm": 0.579138994216919, + "learning_rate": 2.353022999222774e-05, + "loss": 0.0936, + "step": 149 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.6929976940155029, + "learning_rate": 2.3496945786579503e-05, + "loss": 0.1197, + "step": 150 + }, + { + "epoch": 0.9741935483870968, + "grad_norm": 0.6236998438835144, + "learning_rate": 2.3463313001076696e-05, + "loss": 0.0958, + "step": 151 + }, + { + "epoch": 0.9806451612903225, + "grad_norm": 0.4676724672317505, + "learning_rate": 2.342933270180728e-05, + "loss": 0.0713, + "step": 152 + }, + { + "epoch": 0.9870967741935484, + "grad_norm": 0.4456840753555298, + "learning_rate": 2.3395005965874657e-05, + "loss": 0.0784, + "step": 153 + }, + { + "epoch": 0.9935483870967742, + "grad_norm": 0.5712344646453857, + "learning_rate": 2.336033388136355e-05, + "loss": 0.0935, + "step": 154 + }, + { + "epoch": 1.0, + "grad_norm": 0.4726645350456238, + "learning_rate": 2.3325317547305485e-05, + "loss": 0.0712, + "step": 155 + }, + { + "epoch": 1.0064516129032257, + "grad_norm": 0.48649105429649353, + "learning_rate": 2.3289958073643976e-05, + "loss": 0.0584, + "step": 156 + }, + { + "epoch": 1.0129032258064516, + "grad_norm": 0.5159472823143005, + "learning_rate": 2.3254256581199336e-05, + "loss": 0.0579, + "step": 157 + }, + { + "epoch": 1.0193548387096774, + "grad_norm": 0.5775710344314575, + "learning_rate": 2.3218214201633136e-05, + "loss": 0.0676, + "step": 158 + }, + { + "epoch": 1.0258064516129033, + "grad_norm": 0.5070593357086182, + "learning_rate": 2.318183207741237e-05, + "loss": 0.0794, + "step": 159 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 0.38065212965011597, + "learning_rate": 2.3145111361773186e-05, + "loss": 0.051, + "step": 160 + }, + { + "epoch": 1.038709677419355, + "grad_norm": 0.562282383441925, + "learning_rate": 2.310805321868439e-05, + "loss": 0.0753, + "step": 161 + }, + { + "epoch": 1.0451612903225806, + "grad_norm": 0.49883219599723816, + "learning_rate": 2.30706588228105e-05, + "loss": 0.0554, + "step": 162 + }, + { + "epoch": 1.0516129032258064, + "grad_norm": 0.5298740863800049, + "learning_rate": 2.303292935947455e-05, + "loss": 0.0602, + "step": 163 + }, + { + "epoch": 1.0580645161290323, + "grad_norm": 0.5768100619316101, + "learning_rate": 2.2994866024620486e-05, + "loss": 0.0585, + "step": 164 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 0.5079744458198547, + "learning_rate": 2.2956470024775294e-05, + "loss": 0.0459, + "step": 165 + }, + { + "epoch": 1.070967741935484, + "grad_norm": 0.5212790966033936, + "learning_rate": 2.291774257701072e-05, + "loss": 0.0619, + "step": 166 + }, + { + "epoch": 1.0774193548387097, + "grad_norm": 0.5063428282737732, + "learning_rate": 2.2878684908904707e-05, + "loss": 0.0609, + "step": 167 + }, + { + "epoch": 1.0838709677419356, + "grad_norm": 0.6523650288581848, + "learning_rate": 2.2839298258502483e-05, + "loss": 0.067, + "step": 168 + }, + { + "epoch": 1.0903225806451613, + "grad_norm": 0.57984459400177, + "learning_rate": 2.279958387427732e-05, + "loss": 0.0703, + "step": 169 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.6002654433250427, + "learning_rate": 2.2759543015090955e-05, + "loss": 0.074, + "step": 170 + }, + { + "epoch": 1.103225806451613, + "grad_norm": 0.3899862766265869, + "learning_rate": 2.2719176950153688e-05, + "loss": 0.0461, + "step": 171 + }, + { + "epoch": 1.1096774193548387, + "grad_norm": 0.5003259778022766, + "learning_rate": 2.267848695898416e-05, + "loss": 0.0613, + "step": 172 + }, + { + "epoch": 1.1161290322580646, + "grad_norm": 0.558653712272644, + "learning_rate": 2.2637474331368766e-05, + "loss": 0.0658, + "step": 173 + }, + { + "epoch": 1.1225806451612903, + "grad_norm": 0.5032625794410706, + "learning_rate": 2.2596140367320813e-05, + "loss": 0.0564, + "step": 174 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.5199857950210571, + "learning_rate": 2.2554486377039282e-05, + "loss": 0.0587, + "step": 175 + }, + { + "epoch": 1.135483870967742, + "grad_norm": 0.6159687042236328, + "learning_rate": 2.251251368086731e-05, + "loss": 0.0585, + "step": 176 + }, + { + "epoch": 1.1419354838709677, + "grad_norm": 0.5216447114944458, + "learning_rate": 2.2470223609250328e-05, + "loss": 0.0501, + "step": 177 + }, + { + "epoch": 1.1483870967741936, + "grad_norm": 0.49131321907043457, + "learning_rate": 2.24276175026939e-05, + "loss": 0.053, + "step": 178 + }, + { + "epoch": 1.1548387096774193, + "grad_norm": 0.8894760608673096, + "learning_rate": 2.238469671172123e-05, + "loss": 0.0854, + "step": 179 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 0.6628456711769104, + "learning_rate": 2.2341462596830354e-05, + "loss": 0.064, + "step": 180 + }, + { + "epoch": 1.167741935483871, + "grad_norm": 0.4577731788158417, + "learning_rate": 2.229791652845099e-05, + "loss": 0.0543, + "step": 181 + }, + { + "epoch": 1.1741935483870969, + "grad_norm": 0.49301421642303467, + "learning_rate": 2.225405988690115e-05, + "loss": 0.0598, + "step": 182 + }, + { + "epoch": 1.1806451612903226, + "grad_norm": 0.523009717464447, + "learning_rate": 2.220989406234333e-05, + "loss": 0.0752, + "step": 183 + }, + { + "epoch": 1.1870967741935483, + "grad_norm": 0.7591210007667542, + "learning_rate": 2.2165420454740494e-05, + "loss": 0.0643, + "step": 184 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.532319962978363, + "learning_rate": 2.2120640473811656e-05, + "loss": 0.0464, + "step": 185 + }, + { + "epoch": 1.2, + "grad_norm": 0.4334715008735657, + "learning_rate": 2.2075555538987227e-05, + "loss": 0.0669, + "step": 186 + }, + { + "epoch": 1.206451612903226, + "grad_norm": 0.4052492380142212, + "learning_rate": 2.2030167079364007e-05, + "loss": 0.0506, + "step": 187 + }, + { + "epoch": 1.2129032258064516, + "grad_norm": 0.763782799243927, + "learning_rate": 2.1984476533659888e-05, + "loss": 0.0477, + "step": 188 + }, + { + "epoch": 1.2193548387096773, + "grad_norm": 0.46810972690582275, + "learning_rate": 2.1938485350168248e-05, + "loss": 0.055, + "step": 189 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 0.4722144901752472, + "learning_rate": 2.1892194986712045e-05, + "loss": 0.053, + "step": 190 + }, + { + "epoch": 1.232258064516129, + "grad_norm": 0.5537333488464355, + "learning_rate": 2.1845606910597616e-05, + "loss": 0.0686, + "step": 191 + }, + { + "epoch": 1.238709677419355, + "grad_norm": 0.5123704671859741, + "learning_rate": 2.179872259856814e-05, + "loss": 0.0627, + "step": 192 + }, + { + "epoch": 1.2451612903225806, + "grad_norm": 0.5691571831703186, + "learning_rate": 2.175154353675686e-05, + "loss": 0.0601, + "step": 193 + }, + { + "epoch": 1.2516129032258063, + "grad_norm": 0.4747653007507324, + "learning_rate": 2.1704071220639965e-05, + "loss": 0.0551, + "step": 194 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.5692989826202393, + "learning_rate": 2.1656307154989174e-05, + "loss": 0.0482, + "step": 195 + }, + { + "epoch": 1.2645161290322582, + "grad_norm": 0.7472412586212158, + "learning_rate": 2.1608252853824047e-05, + "loss": 0.0609, + "step": 196 + }, + { + "epoch": 1.270967741935484, + "grad_norm": 0.568708062171936, + "learning_rate": 2.1559909840364e-05, + "loss": 0.0572, + "step": 197 + }, + { + "epoch": 1.2774193548387096, + "grad_norm": 0.6601235866546631, + "learning_rate": 2.1511279646980016e-05, + "loss": 0.0777, + "step": 198 + }, + { + "epoch": 1.2838709677419355, + "grad_norm": 0.429850697517395, + "learning_rate": 2.1462363815146065e-05, + "loss": 0.0454, + "step": 199 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.7702894806861877, + "learning_rate": 2.1413163895390254e-05, + "loss": 0.0655, + "step": 200 + }, + { + "epoch": 1.2967741935483872, + "grad_norm": 0.4497153162956238, + "learning_rate": 2.1363681447245686e-05, + "loss": 0.0512, + "step": 201 + }, + { + "epoch": 1.303225806451613, + "grad_norm": 0.5631290674209595, + "learning_rate": 2.1313918039200995e-05, + "loss": 0.0645, + "step": 202 + }, + { + "epoch": 1.3096774193548386, + "grad_norm": 0.7414901852607727, + "learning_rate": 2.1263875248650662e-05, + "loss": 0.0561, + "step": 203 + }, + { + "epoch": 1.3161290322580645, + "grad_norm": 0.5053102970123291, + "learning_rate": 2.121355466184499e-05, + "loss": 0.0608, + "step": 204 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 10.00545597076416, + "learning_rate": 2.116295787383985e-05, + "loss": 0.0826, + "step": 205 + }, + { + "epoch": 1.3290322580645162, + "grad_norm": 0.6418637037277222, + "learning_rate": 2.1112086488446085e-05, + "loss": 0.0743, + "step": 206 + }, + { + "epoch": 1.335483870967742, + "grad_norm": 0.4627211391925812, + "learning_rate": 2.1060942118178706e-05, + "loss": 0.0476, + "step": 207 + }, + { + "epoch": 1.3419354838709676, + "grad_norm": 0.5375849604606628, + "learning_rate": 2.1009526384205767e-05, + "loss": 0.048, + "step": 208 + }, + { + "epoch": 1.3483870967741935, + "grad_norm": 0.606073796749115, + "learning_rate": 2.095784091629697e-05, + "loss": 0.0704, + "step": 209 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 0.44339242577552795, + "learning_rate": 2.0905887352772004e-05, + "loss": 0.0516, + "step": 210 + }, + { + "epoch": 1.3612903225806452, + "grad_norm": 0.6248610019683838, + "learning_rate": 2.085366734044864e-05, + "loss": 0.066, + "step": 211 + }, + { + "epoch": 1.367741935483871, + "grad_norm": 0.5914815664291382, + "learning_rate": 2.080118253459049e-05, + "loss": 0.0611, + "step": 212 + }, + { + "epoch": 1.3741935483870968, + "grad_norm": 0.45894381403923035, + "learning_rate": 2.0748434598854573e-05, + "loss": 0.0501, + "step": 213 + }, + { + "epoch": 1.3806451612903226, + "grad_norm": 0.49100032448768616, + "learning_rate": 2.0695425205238557e-05, + "loss": 0.0552, + "step": 214 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.528611958026886, + "learning_rate": 2.0642156034027783e-05, + "loss": 0.0639, + "step": 215 + }, + { + "epoch": 1.3935483870967742, + "grad_norm": 0.4487656056880951, + "learning_rate": 2.0588628773741973e-05, + "loss": 0.0435, + "step": 216 + }, + { + "epoch": 1.4, + "grad_norm": 0.547444224357605, + "learning_rate": 2.0534845121081742e-05, + "loss": 0.0547, + "step": 217 + }, + { + "epoch": 1.4064516129032258, + "grad_norm": 0.5207445621490479, + "learning_rate": 2.0480806780874794e-05, + "loss": 0.0574, + "step": 218 + }, + { + "epoch": 1.4129032258064516, + "grad_norm": 0.5784499049186707, + "learning_rate": 2.0426515466021887e-05, + "loss": 0.0608, + "step": 219 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 0.7198527455329895, + "learning_rate": 2.0371972897442532e-05, + "loss": 0.0639, + "step": 220 + }, + { + "epoch": 1.4258064516129032, + "grad_norm": 0.4550151228904724, + "learning_rate": 2.031718080402046e-05, + "loss": 0.0547, + "step": 221 + }, + { + "epoch": 1.432258064516129, + "grad_norm": 0.48588842153549194, + "learning_rate": 2.026214092254881e-05, + "loss": 0.0603, + "step": 222 + }, + { + "epoch": 1.4387096774193548, + "grad_norm": 0.5426737666130066, + "learning_rate": 2.0206854997675072e-05, + "loss": 0.0616, + "step": 223 + }, + { + "epoch": 1.4451612903225808, + "grad_norm": 0.5034387707710266, + "learning_rate": 2.0151324781845787e-05, + "loss": 0.0644, + "step": 224 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 0.5200063586235046, + "learning_rate": 2.0095552035251007e-05, + "loss": 0.0596, + "step": 225 + }, + { + "epoch": 1.4580645161290322, + "grad_norm": 0.4462428390979767, + "learning_rate": 2.0039538525768496e-05, + "loss": 0.0523, + "step": 226 + }, + { + "epoch": 1.4645161290322581, + "grad_norm": 0.5513397455215454, + "learning_rate": 1.9983286028907687e-05, + "loss": 0.0528, + "step": 227 + }, + { + "epoch": 1.4709677419354839, + "grad_norm": 0.44743800163269043, + "learning_rate": 1.992679632775341e-05, + "loss": 0.0649, + "step": 228 + }, + { + "epoch": 1.4774193548387098, + "grad_norm": 0.4505648910999298, + "learning_rate": 1.9870071212909357e-05, + "loss": 0.0453, + "step": 229 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.48718520998954773, + "learning_rate": 1.9813112482441345e-05, + "loss": 0.0664, + "step": 230 + }, + { + "epoch": 1.4903225806451612, + "grad_norm": 0.4392196834087372, + "learning_rate": 1.9755921941820314e-05, + "loss": 0.0504, + "step": 231 + }, + { + "epoch": 1.4967741935483871, + "grad_norm": 0.5312716364860535, + "learning_rate": 1.9698501403865083e-05, + "loss": 0.0699, + "step": 232 + }, + { + "epoch": 1.5032258064516129, + "grad_norm": 0.5387852787971497, + "learning_rate": 1.9640852688684904e-05, + "loss": 0.071, + "step": 233 + }, + { + "epoch": 1.5096774193548388, + "grad_norm": 0.4734801650047302, + "learning_rate": 1.9582977623621766e-05, + "loss": 0.0561, + "step": 234 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 0.4738084375858307, + "learning_rate": 1.9524878043192463e-05, + "loss": 0.0545, + "step": 235 + }, + { + "epoch": 1.5225806451612902, + "grad_norm": 0.5166822671890259, + "learning_rate": 1.9466555789030456e-05, + "loss": 0.0708, + "step": 236 + }, + { + "epoch": 1.5290322580645161, + "grad_norm": 0.5719185471534729, + "learning_rate": 1.9408012709827485e-05, + "loss": 0.073, + "step": 237 + }, + { + "epoch": 1.535483870967742, + "grad_norm": 0.5363075733184814, + "learning_rate": 1.934925066127498e-05, + "loss": 0.0581, + "step": 238 + }, + { + "epoch": 1.5419354838709678, + "grad_norm": 0.551699697971344, + "learning_rate": 1.9290271506005236e-05, + "loss": 0.0598, + "step": 239 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 0.5568850636482239, + "learning_rate": 1.9231077113532363e-05, + "loss": 0.0471, + "step": 240 + }, + { + "epoch": 1.5548387096774192, + "grad_norm": 0.5048776268959045, + "learning_rate": 1.917166936019304e-05, + "loss": 0.0613, + "step": 241 + }, + { + "epoch": 1.5612903225806452, + "grad_norm": 0.516986608505249, + "learning_rate": 1.911205012908703e-05, + "loss": 0.0678, + "step": 242 + }, + { + "epoch": 1.567741935483871, + "grad_norm": 0.48142287135124207, + "learning_rate": 1.90522213100175e-05, + "loss": 0.0557, + "step": 243 + }, + { + "epoch": 1.5741935483870968, + "grad_norm": 0.4997798800468445, + "learning_rate": 1.8992184799431095e-05, + "loss": 0.042, + "step": 244 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.5074776411056519, + "learning_rate": 1.893194250035786e-05, + "loss": 0.073, + "step": 245 + }, + { + "epoch": 1.5870967741935482, + "grad_norm": 0.5136696696281433, + "learning_rate": 1.8871496322350883e-05, + "loss": 0.0547, + "step": 246 + }, + { + "epoch": 1.5935483870967742, + "grad_norm": 0.6183574795722961, + "learning_rate": 1.881084818142579e-05, + "loss": 0.0708, + "step": 247 + }, + { + "epoch": 1.6, + "grad_norm": 0.5576770901679993, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.0629, + "step": 248 + }, + { + "epoch": 1.6064516129032258, + "grad_norm": 0.4211249351501465, + "learning_rate": 1.868895370683179e-05, + "loss": 0.0544, + "step": 249 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 0.43533676862716675, + "learning_rate": 1.862771123695917e-05, + "loss": 0.0568, + "step": 250 + }, + { + "epoch": 1.6193548387096774, + "grad_norm": 0.48171842098236084, + "learning_rate": 1.8566274531638516e-05, + "loss": 0.0524, + "step": 251 + }, + { + "epoch": 1.6258064516129034, + "grad_norm": 0.459471195936203, + "learning_rate": 1.850464553828307e-05, + "loss": 0.0471, + "step": 252 + }, + { + "epoch": 1.632258064516129, + "grad_norm": 0.5311537384986877, + "learning_rate": 1.844282621040119e-05, + "loss": 0.0766, + "step": 253 + }, + { + "epoch": 1.6387096774193548, + "grad_norm": 0.5022658109664917, + "learning_rate": 1.838081850753445e-05, + "loss": 0.0579, + "step": 254 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.5516560077667236, + "learning_rate": 1.8318624395195483e-05, + "loss": 0.0616, + "step": 255 + }, + { + "epoch": 1.6516129032258065, + "grad_norm": 0.4552045166492462, + "learning_rate": 1.825624584480573e-05, + "loss": 0.0512, + "step": 256 + }, + { + "epoch": 1.6580645161290324, + "grad_norm": 0.5871717929840088, + "learning_rate": 1.8193684833632925e-05, + "loss": 0.0641, + "step": 257 + }, + { + "epoch": 1.664516129032258, + "grad_norm": 0.46038615703582764, + "learning_rate": 1.8130943344728414e-05, + "loss": 0.0459, + "step": 258 + }, + { + "epoch": 1.6709677419354838, + "grad_norm": 0.5282014608383179, + "learning_rate": 1.8068023366864305e-05, + "loss": 0.0569, + "step": 259 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 0.3797317147254944, + "learning_rate": 1.800492689447043e-05, + "loss": 0.0459, + "step": 260 + }, + { + "epoch": 1.6838709677419355, + "grad_norm": 0.5863360166549683, + "learning_rate": 1.7941655927571125e-05, + "loss": 0.0695, + "step": 261 + }, + { + "epoch": 1.6903225806451614, + "grad_norm": 0.5562090873718262, + "learning_rate": 1.7878212471721824e-05, + "loss": 0.0554, + "step": 262 + }, + { + "epoch": 1.696774193548387, + "grad_norm": 0.5164937973022461, + "learning_rate": 1.781459853794551e-05, + "loss": 0.0542, + "step": 263 + }, + { + "epoch": 1.7032258064516128, + "grad_norm": 0.5710752010345459, + "learning_rate": 1.7750816142668937e-05, + "loss": 0.0641, + "step": 264 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 0.43633976578712463, + "learning_rate": 1.7686867307658743e-05, + "loss": 0.0498, + "step": 265 + }, + { + "epoch": 1.7161290322580647, + "grad_norm": 0.5214335322380066, + "learning_rate": 1.7622754059957343e-05, + "loss": 0.054, + "step": 266 + }, + { + "epoch": 1.7225806451612904, + "grad_norm": 0.6009476780891418, + "learning_rate": 1.7558478431818702e-05, + "loss": 0.0538, + "step": 267 + }, + { + "epoch": 1.729032258064516, + "grad_norm": 0.5809276700019836, + "learning_rate": 1.749404246064388e-05, + "loss": 0.0751, + "step": 268 + }, + { + "epoch": 1.7354838709677418, + "grad_norm": 0.5733875632286072, + "learning_rate": 1.7429448188916483e-05, + "loss": 0.0685, + "step": 269 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 0.3861143887042999, + "learning_rate": 1.7364697664137912e-05, + "loss": 0.044, + "step": 270 + }, + { + "epoch": 1.7483870967741937, + "grad_norm": 0.8718386292457581, + "learning_rate": 1.7299792938762443e-05, + "loss": 0.0807, + "step": 271 + }, + { + "epoch": 1.7548387096774194, + "grad_norm": 0.6809967160224915, + "learning_rate": 1.72347360701322e-05, + "loss": 0.0698, + "step": 272 + }, + { + "epoch": 1.761290322580645, + "grad_norm": 0.45045140385627747, + "learning_rate": 1.7169529120411922e-05, + "loss": 0.0552, + "step": 273 + }, + { + "epoch": 1.7677419354838708, + "grad_norm": 0.46889108419418335, + "learning_rate": 1.710417415652359e-05, + "loss": 0.0576, + "step": 274 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.5053566098213196, + "learning_rate": 1.7038673250080934e-05, + "loss": 0.0535, + "step": 275 + }, + { + "epoch": 1.7806451612903227, + "grad_norm": 0.4330599904060364, + "learning_rate": 1.6973028477323742e-05, + "loss": 0.0518, + "step": 276 + }, + { + "epoch": 1.7870967741935484, + "grad_norm": 0.4866834580898285, + "learning_rate": 1.6907241919052068e-05, + "loss": 0.051, + "step": 277 + }, + { + "epoch": 1.793548387096774, + "grad_norm": 0.6048698425292969, + "learning_rate": 1.6841315660560252e-05, + "loss": 0.0683, + "step": 278 + }, + { + "epoch": 1.8, + "grad_norm": 0.39490604400634766, + "learning_rate": 1.677525179157086e-05, + "loss": 0.0515, + "step": 279 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 0.4544709324836731, + "learning_rate": 1.6709052406168393e-05, + "loss": 0.0624, + "step": 280 + }, + { + "epoch": 1.8129032258064517, + "grad_norm": 0.5158767700195312, + "learning_rate": 1.664271960273295e-05, + "loss": 0.0575, + "step": 281 + }, + { + "epoch": 1.8193548387096774, + "grad_norm": 0.5172263979911804, + "learning_rate": 1.6576255483873686e-05, + "loss": 0.0578, + "step": 282 + }, + { + "epoch": 1.8258064516129031, + "grad_norm": 0.4233238995075226, + "learning_rate": 1.6509662156362196e-05, + "loss": 0.0547, + "step": 283 + }, + { + "epoch": 1.832258064516129, + "grad_norm": 0.45361143350601196, + "learning_rate": 1.6442941731065697e-05, + "loss": 0.0512, + "step": 284 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 0.5802233219146729, + "learning_rate": 1.637609632288014e-05, + "loss": 0.0596, + "step": 285 + }, + { + "epoch": 1.8451612903225807, + "grad_norm": 0.5369323492050171, + "learning_rate": 1.630912805066317e-05, + "loss": 0.0646, + "step": 286 + }, + { + "epoch": 1.8516129032258064, + "grad_norm": 0.45122525095939636, + "learning_rate": 1.6242039037166977e-05, + "loss": 0.0517, + "step": 287 + }, + { + "epoch": 1.8580645161290321, + "grad_norm": 0.39205196499824524, + "learning_rate": 1.6174831408970964e-05, + "loss": 0.0491, + "step": 288 + }, + { + "epoch": 1.864516129032258, + "grad_norm": 0.4472959637641907, + "learning_rate": 1.6107507296414383e-05, + "loss": 0.049, + "step": 289 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 0.41624531149864197, + "learning_rate": 1.6040068833528797e-05, + "loss": 0.0483, + "step": 290 + }, + { + "epoch": 1.8774193548387097, + "grad_norm": 0.43875551223754883, + "learning_rate": 1.597251815797044e-05, + "loss": 0.0487, + "step": 291 + }, + { + "epoch": 1.8838709677419354, + "grad_norm": 0.4978736937046051, + "learning_rate": 1.5904857410952417e-05, + "loss": 0.0573, + "step": 292 + }, + { + "epoch": 1.8903225806451613, + "grad_norm": 0.5798497796058655, + "learning_rate": 1.5837088737176896e-05, + "loss": 0.0683, + "step": 293 + }, + { + "epoch": 1.896774193548387, + "grad_norm": 0.7377052903175354, + "learning_rate": 1.5769214284767086e-05, + "loss": 0.0583, + "step": 294 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 0.4153827428817749, + "learning_rate": 1.570123620519915e-05, + "loss": 0.0543, + "step": 295 + }, + { + "epoch": 1.9096774193548387, + "grad_norm": 0.4852810800075531, + "learning_rate": 1.563315665323401e-05, + "loss": 0.0636, + "step": 296 + }, + { + "epoch": 1.9161290322580644, + "grad_norm": 0.5545767545700073, + "learning_rate": 1.5564977786849055e-05, + "loss": 0.062, + "step": 297 + }, + { + "epoch": 1.9225806451612903, + "grad_norm": 0.4363822937011719, + "learning_rate": 1.549670176716973e-05, + "loss": 0.0516, + "step": 298 + }, + { + "epoch": 1.9290322580645163, + "grad_norm": 0.5309383273124695, + "learning_rate": 1.5428330758401027e-05, + "loss": 0.0647, + "step": 299 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 0.6617056131362915, + "learning_rate": 1.53598669277589e-05, + "loss": 0.0641, + "step": 300 + }, + { + "epoch": 1.9419354838709677, + "grad_norm": 0.49968254566192627, + "learning_rate": 1.529131244540155e-05, + "loss": 0.0585, + "step": 301 + }, + { + "epoch": 1.9483870967741934, + "grad_norm": 0.40158751606941223, + "learning_rate": 1.5222669484360644e-05, + "loss": 0.0537, + "step": 302 + }, + { + "epoch": 1.9548387096774194, + "grad_norm": 0.4537198543548584, + "learning_rate": 1.5153940220472451e-05, + "loss": 0.0511, + "step": 303 + }, + { + "epoch": 1.9612903225806453, + "grad_norm": 0.47163766622543335, + "learning_rate": 1.5085126832308843e-05, + "loss": 0.0532, + "step": 304 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.4738634526729584, + "learning_rate": 1.5016231501108253e-05, + "loss": 0.0615, + "step": 305 + }, + { + "epoch": 1.9741935483870967, + "grad_norm": 0.38560569286346436, + "learning_rate": 1.494725641070654e-05, + "loss": 0.0522, + "step": 306 + }, + { + "epoch": 1.9806451612903224, + "grad_norm": 0.5569445490837097, + "learning_rate": 1.4878203747467764e-05, + "loss": 0.0731, + "step": 307 + }, + { + "epoch": 1.9870967741935484, + "grad_norm": 0.38958773016929626, + "learning_rate": 1.480907570021487e-05, + "loss": 0.0461, + "step": 308 + }, + { + "epoch": 1.9935483870967743, + "grad_norm": 0.4473820924758911, + "learning_rate": 1.4739874460160316e-05, + "loss": 0.0555, + "step": 309 + }, + { + "epoch": 2.0, + "grad_norm": 0.39891934394836426, + "learning_rate": 1.4670602220836633e-05, + "loss": 0.051, + "step": 310 + }, + { + "epoch": 2.0064516129032257, + "grad_norm": 0.474127858877182, + "learning_rate": 1.4601261178026854e-05, + "loss": 0.0401, + "step": 311 + }, + { + "epoch": 2.0129032258064514, + "grad_norm": 0.3391839563846588, + "learning_rate": 1.4531853529694956e-05, + "loss": 0.0333, + "step": 312 + }, + { + "epoch": 2.0193548387096776, + "grad_norm": 0.3230273723602295, + "learning_rate": 1.446238147591616e-05, + "loss": 0.0282, + "step": 313 + }, + { + "epoch": 2.0258064516129033, + "grad_norm": 0.3246399462223053, + "learning_rate": 1.439284721880721e-05, + "loss": 0.0345, + "step": 314 + }, + { + "epoch": 2.032258064516129, + "grad_norm": 0.41817039251327515, + "learning_rate": 1.4323252962456554e-05, + "loss": 0.0288, + "step": 315 + }, + { + "epoch": 2.0387096774193547, + "grad_norm": 0.48674166202545166, + "learning_rate": 1.4253600912854497e-05, + "loss": 0.0354, + "step": 316 + }, + { + "epoch": 2.0451612903225804, + "grad_norm": 0.42214757204055786, + "learning_rate": 1.4183893277823265e-05, + "loss": 0.0388, + "step": 317 + }, + { + "epoch": 2.0516129032258066, + "grad_norm": 0.5475701093673706, + "learning_rate": 1.411413226694702e-05, + "loss": 0.0294, + "step": 318 + }, + { + "epoch": 2.0580645161290323, + "grad_norm": 0.5432962775230408, + "learning_rate": 1.4044320091501834e-05, + "loss": 0.0372, + "step": 319 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 0.49539855122566223, + "learning_rate": 1.3974458964385579e-05, + "loss": 0.0425, + "step": 320 + }, + { + "epoch": 2.0709677419354837, + "grad_norm": 0.340425044298172, + "learning_rate": 1.3904551100047791e-05, + "loss": 0.026, + "step": 321 + }, + { + "epoch": 2.07741935483871, + "grad_norm": 0.4815217852592468, + "learning_rate": 1.3834598714419486e-05, + "loss": 0.0352, + "step": 322 + }, + { + "epoch": 2.0838709677419356, + "grad_norm": 0.4457317888736725, + "learning_rate": 1.3764604024842903e-05, + "loss": 0.028, + "step": 323 + }, + { + "epoch": 2.0903225806451613, + "grad_norm": 0.45776546001434326, + "learning_rate": 1.369456925000123e-05, + "loss": 0.0287, + "step": 324 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 0.3825792968273163, + "learning_rate": 1.362449660984826e-05, + "loss": 0.0257, + "step": 325 + }, + { + "epoch": 2.1032258064516127, + "grad_norm": 0.44209763407707214, + "learning_rate": 1.3554388325538059e-05, + "loss": 0.0274, + "step": 326 + }, + { + "epoch": 2.109677419354839, + "grad_norm": 0.37732046842575073, + "learning_rate": 1.3484246619354524e-05, + "loss": 0.0263, + "step": 327 + }, + { + "epoch": 2.1161290322580646, + "grad_norm": 0.4975365698337555, + "learning_rate": 1.3414073714640951e-05, + "loss": 0.0294, + "step": 328 + }, + { + "epoch": 2.1225806451612903, + "grad_norm": 0.37548768520355225, + "learning_rate": 1.3343871835729565e-05, + "loss": 0.0261, + "step": 329 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.722154438495636, + "learning_rate": 1.3273643207871025e-05, + "loss": 0.0296, + "step": 330 + }, + { + "epoch": 2.135483870967742, + "grad_norm": 0.513611912727356, + "learning_rate": 1.3203390057163855e-05, + "loss": 0.0326, + "step": 331 + }, + { + "epoch": 2.141935483870968, + "grad_norm": 0.43579375743865967, + "learning_rate": 1.3133114610483909e-05, + "loss": 0.035, + "step": 332 + }, + { + "epoch": 2.1483870967741936, + "grad_norm": 0.4927336275577545, + "learning_rate": 1.3062819095413786e-05, + "loss": 0.0358, + "step": 333 + }, + { + "epoch": 2.1548387096774193, + "grad_norm": 0.43542489409446716, + "learning_rate": 1.2992505740172196e-05, + "loss": 0.035, + "step": 334 + }, + { + "epoch": 2.161290322580645, + "grad_norm": 0.34009236097335815, + "learning_rate": 1.2922176773543355e-05, + "loss": 0.0264, + "step": 335 + }, + { + "epoch": 2.167741935483871, + "grad_norm": 0.4710192084312439, + "learning_rate": 1.2851834424806314e-05, + "loss": 0.0403, + "step": 336 + }, + { + "epoch": 2.174193548387097, + "grad_norm": 0.8653304576873779, + "learning_rate": 1.2781480923664326e-05, + "loss": 0.0839, + "step": 337 + }, + { + "epoch": 2.1806451612903226, + "grad_norm": 0.7528795599937439, + "learning_rate": 1.2711118500174138e-05, + "loss": 0.0488, + "step": 338 + }, + { + "epoch": 2.1870967741935483, + "grad_norm": 0.5551451444625854, + "learning_rate": 1.2640749384675324e-05, + "loss": 0.0223, + "step": 339 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.42200708389282227, + "learning_rate": 1.2570375807719576e-05, + "loss": 0.0305, + "step": 340 + }, + { + "epoch": 2.2, + "grad_norm": 0.5258976817131042, + "learning_rate": 1.25e-05, + "loss": 0.0455, + "step": 341 + }, + { + "epoch": 2.206451612903226, + "grad_norm": 0.495807945728302, + "learning_rate": 1.242962419228043e-05, + "loss": 0.0323, + "step": 342 + }, + { + "epoch": 2.2129032258064516, + "grad_norm": 0.5464356541633606, + "learning_rate": 1.2359250615324678e-05, + "loss": 0.0325, + "step": 343 + }, + { + "epoch": 2.2193548387096773, + "grad_norm": 0.5555934906005859, + "learning_rate": 1.2288881499825863e-05, + "loss": 0.0504, + "step": 344 + }, + { + "epoch": 2.225806451612903, + "grad_norm": 0.41927701234817505, + "learning_rate": 1.2218519076335677e-05, + "loss": 0.0288, + "step": 345 + }, + { + "epoch": 2.232258064516129, + "grad_norm": 0.5449569821357727, + "learning_rate": 1.2148165575193685e-05, + "loss": 0.0328, + "step": 346 + }, + { + "epoch": 2.238709677419355, + "grad_norm": 0.4198172688484192, + "learning_rate": 1.2077823226456648e-05, + "loss": 0.0284, + "step": 347 + }, + { + "epoch": 2.2451612903225806, + "grad_norm": 0.5396814346313477, + "learning_rate": 1.2007494259827809e-05, + "loss": 0.0379, + "step": 348 + }, + { + "epoch": 2.2516129032258063, + "grad_norm": 0.4842919409275055, + "learning_rate": 1.1937180904586215e-05, + "loss": 0.0316, + "step": 349 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 0.5152572989463806, + "learning_rate": 1.1866885389516092e-05, + "loss": 0.0321, + "step": 350 + }, + { + "epoch": 2.264516129032258, + "grad_norm": 0.556614875793457, + "learning_rate": 1.179660994283615e-05, + "loss": 0.0372, + "step": 351 + }, + { + "epoch": 2.270967741935484, + "grad_norm": 0.5159235000610352, + "learning_rate": 1.1726356792128978e-05, + "loss": 0.0328, + "step": 352 + }, + { + "epoch": 2.2774193548387096, + "grad_norm": 0.5564429759979248, + "learning_rate": 1.1656128164270436e-05, + "loss": 0.0304, + "step": 353 + }, + { + "epoch": 2.2838709677419353, + "grad_norm": 0.6227903366088867, + "learning_rate": 1.1585926285359049e-05, + "loss": 0.0321, + "step": 354 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 0.5218878388404846, + "learning_rate": 1.1515753380645479e-05, + "loss": 0.0358, + "step": 355 + }, + { + "epoch": 2.296774193548387, + "grad_norm": 0.49731266498565674, + "learning_rate": 1.1445611674461942e-05, + "loss": 0.0331, + "step": 356 + }, + { + "epoch": 2.303225806451613, + "grad_norm": 0.5095941424369812, + "learning_rate": 1.1375503390151737e-05, + "loss": 0.0315, + "step": 357 + }, + { + "epoch": 2.3096774193548386, + "grad_norm": 0.4576358199119568, + "learning_rate": 1.1305430749998775e-05, + "loss": 0.0304, + "step": 358 + }, + { + "epoch": 2.3161290322580643, + "grad_norm": 0.5103798508644104, + "learning_rate": 1.12353959751571e-05, + "loss": 0.0281, + "step": 359 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 0.5072308778762817, + "learning_rate": 1.1165401285580515e-05, + "loss": 0.0267, + "step": 360 + }, + { + "epoch": 2.329032258064516, + "grad_norm": 0.45558691024780273, + "learning_rate": 1.1095448899952212e-05, + "loss": 0.0302, + "step": 361 + }, + { + "epoch": 2.335483870967742, + "grad_norm": 0.4773171842098236, + "learning_rate": 1.1025541035614427e-05, + "loss": 0.0307, + "step": 362 + }, + { + "epoch": 2.3419354838709676, + "grad_norm": 0.4630301892757416, + "learning_rate": 1.0955679908498171e-05, + "loss": 0.0292, + "step": 363 + }, + { + "epoch": 2.3483870967741938, + "grad_norm": 0.5814460515975952, + "learning_rate": 1.0885867733052985e-05, + "loss": 0.034, + "step": 364 + }, + { + "epoch": 2.3548387096774195, + "grad_norm": 0.3135308623313904, + "learning_rate": 1.0816106722176741e-05, + "loss": 0.0264, + "step": 365 + }, + { + "epoch": 2.361290322580645, + "grad_norm": 0.4219888150691986, + "learning_rate": 1.0746399087145504e-05, + "loss": 0.0304, + "step": 366 + }, + { + "epoch": 2.367741935483871, + "grad_norm": 0.4246158003807068, + "learning_rate": 1.0676747037543447e-05, + "loss": 0.032, + "step": 367 + }, + { + "epoch": 2.3741935483870966, + "grad_norm": 0.4565359950065613, + "learning_rate": 1.0607152781192796e-05, + "loss": 0.0326, + "step": 368 + }, + { + "epoch": 2.3806451612903228, + "grad_norm": 0.4495943486690521, + "learning_rate": 1.053761852408384e-05, + "loss": 0.0307, + "step": 369 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.47505924105644226, + "learning_rate": 1.0468146470305047e-05, + "loss": 0.0366, + "step": 370 + }, + { + "epoch": 2.393548387096774, + "grad_norm": 0.41802337765693665, + "learning_rate": 1.039873882197315e-05, + "loss": 0.0242, + "step": 371 + }, + { + "epoch": 2.4, + "grad_norm": 0.4308302104473114, + "learning_rate": 1.0329397779163372e-05, + "loss": 0.0303, + "step": 372 + }, + { + "epoch": 2.4064516129032256, + "grad_norm": 0.3776704967021942, + "learning_rate": 1.0260125539839686e-05, + "loss": 0.0224, + "step": 373 + }, + { + "epoch": 2.412903225806452, + "grad_norm": 0.3952430188655853, + "learning_rate": 1.0190924299785138e-05, + "loss": 0.0236, + "step": 374 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 0.5212628841400146, + "learning_rate": 1.0121796252532237e-05, + "loss": 0.0352, + "step": 375 + }, + { + "epoch": 2.425806451612903, + "grad_norm": 0.5264010429382324, + "learning_rate": 1.0052743589293463e-05, + "loss": 0.0366, + "step": 376 + }, + { + "epoch": 2.432258064516129, + "grad_norm": 0.42148974537849426, + "learning_rate": 9.983768498891747e-06, + "loss": 0.0281, + "step": 377 + }, + { + "epoch": 2.4387096774193546, + "grad_norm": 0.4387865960597992, + "learning_rate": 9.91487316769116e-06, + "loss": 0.0321, + "step": 378 + }, + { + "epoch": 2.445161290322581, + "grad_norm": 0.4530801475048065, + "learning_rate": 9.846059779527552e-06, + "loss": 0.03, + "step": 379 + }, + { + "epoch": 2.4516129032258065, + "grad_norm": 0.44786474108695984, + "learning_rate": 9.777330515639356e-06, + "loss": 0.0312, + "step": 380 + }, + { + "epoch": 2.458064516129032, + "grad_norm": 0.42808324098587036, + "learning_rate": 9.708687554598454e-06, + "loss": 0.0321, + "step": 381 + }, + { + "epoch": 2.464516129032258, + "grad_norm": 0.4658293128013611, + "learning_rate": 9.640133072241105e-06, + "loss": 0.0335, + "step": 382 + }, + { + "epoch": 2.4709677419354836, + "grad_norm": 0.45854416489601135, + "learning_rate": 9.571669241598974e-06, + "loss": 0.0306, + "step": 383 + }, + { + "epoch": 2.47741935483871, + "grad_norm": 0.5602400302886963, + "learning_rate": 9.503298232830274e-06, + "loss": 0.0425, + "step": 384 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 0.4135296940803528, + "learning_rate": 9.43502221315095e-06, + "loss": 0.0317, + "step": 385 + }, + { + "epoch": 2.490322580645161, + "grad_norm": 0.6756112575531006, + "learning_rate": 9.366843346765992e-06, + "loss": 0.0492, + "step": 386 + }, + { + "epoch": 2.496774193548387, + "grad_norm": 0.6048617362976074, + "learning_rate": 9.298763794800856e-06, + "loss": 0.0326, + "step": 387 + }, + { + "epoch": 2.5032258064516126, + "grad_norm": 0.3737858831882477, + "learning_rate": 9.230785715232917e-06, + "loss": 0.0226, + "step": 388 + }, + { + "epoch": 2.509677419354839, + "grad_norm": 0.49958306550979614, + "learning_rate": 9.162911262823104e-06, + "loss": 0.0293, + "step": 389 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.4132345914840698, + "learning_rate": 9.095142589047586e-06, + "loss": 0.0268, + "step": 390 + }, + { + "epoch": 2.52258064516129, + "grad_norm": 0.5339500308036804, + "learning_rate": 9.027481842029567e-06, + "loss": 0.0308, + "step": 391 + }, + { + "epoch": 2.5290322580645164, + "grad_norm": 0.5680338740348816, + "learning_rate": 8.9599311664712e-06, + "loss": 0.026, + "step": 392 + }, + { + "epoch": 2.535483870967742, + "grad_norm": 0.4945621192455292, + "learning_rate": 8.89249270358562e-06, + "loss": 0.0414, + "step": 393 + }, + { + "epoch": 2.541935483870968, + "grad_norm": 0.478188157081604, + "learning_rate": 8.825168591029042e-06, + "loss": 0.0325, + "step": 394 + }, + { + "epoch": 2.5483870967741935, + "grad_norm": 0.41539856791496277, + "learning_rate": 8.757960962833026e-06, + "loss": 0.0276, + "step": 395 + }, + { + "epoch": 2.554838709677419, + "grad_norm": 0.41548025608062744, + "learning_rate": 8.69087194933683e-06, + "loss": 0.0258, + "step": 396 + }, + { + "epoch": 2.5612903225806454, + "grad_norm": 0.7209835052490234, + "learning_rate": 8.623903677119866e-06, + "loss": 0.0275, + "step": 397 + }, + { + "epoch": 2.567741935483871, + "grad_norm": 0.45113834738731384, + "learning_rate": 8.557058268934306e-06, + "loss": 0.0276, + "step": 398 + }, + { + "epoch": 2.574193548387097, + "grad_norm": 0.4919924736022949, + "learning_rate": 8.490337843637807e-06, + "loss": 0.0352, + "step": 399 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.4441167414188385, + "learning_rate": 8.423744516126313e-06, + "loss": 0.0312, + "step": 400 + }, + { + "epoch": 2.587096774193548, + "grad_norm": 0.3870048522949219, + "learning_rate": 8.357280397267054e-06, + "loss": 0.0273, + "step": 401 + }, + { + "epoch": 2.5935483870967744, + "grad_norm": 0.4747593104839325, + "learning_rate": 8.29094759383161e-06, + "loss": 0.0428, + "step": 402 + }, + { + "epoch": 2.6, + "grad_norm": 0.3494237959384918, + "learning_rate": 8.224748208429142e-06, + "loss": 0.0249, + "step": 403 + }, + { + "epoch": 2.606451612903226, + "grad_norm": 0.3618505001068115, + "learning_rate": 8.158684339439748e-06, + "loss": 0.0221, + "step": 404 + }, + { + "epoch": 2.6129032258064515, + "grad_norm": 0.45744070410728455, + "learning_rate": 8.092758080947939e-06, + "loss": 0.0303, + "step": 405 + }, + { + "epoch": 2.6193548387096772, + "grad_norm": 0.3921363055706024, + "learning_rate": 8.02697152267626e-06, + "loss": 0.0267, + "step": 406 + }, + { + "epoch": 2.6258064516129034, + "grad_norm": 0.5149343013763428, + "learning_rate": 7.961326749919069e-06, + "loss": 0.0347, + "step": 407 + }, + { + "epoch": 2.632258064516129, + "grad_norm": 0.5246243476867676, + "learning_rate": 7.895825843476412e-06, + "loss": 0.0318, + "step": 408 + }, + { + "epoch": 2.638709677419355, + "grad_norm": 0.5338672995567322, + "learning_rate": 7.83047087958808e-06, + "loss": 0.0331, + "step": 409 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.4028920531272888, + "learning_rate": 7.7652639298678e-06, + "loss": 0.0251, + "step": 410 + }, + { + "epoch": 2.6516129032258062, + "grad_norm": 0.3391985297203064, + "learning_rate": 7.70020706123756e-06, + "loss": 0.0206, + "step": 411 + }, + { + "epoch": 2.6580645161290324, + "grad_norm": 0.4651046097278595, + "learning_rate": 7.635302335862094e-06, + "loss": 0.0242, + "step": 412 + }, + { + "epoch": 2.664516129032258, + "grad_norm": 0.4581477642059326, + "learning_rate": 7.570551811083521e-06, + "loss": 0.0334, + "step": 413 + }, + { + "epoch": 2.670967741935484, + "grad_norm": 0.629748523235321, + "learning_rate": 7.505957539356126e-06, + "loss": 0.0426, + "step": 414 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 0.44972798228263855, + "learning_rate": 7.441521568181299e-06, + "loss": 0.0279, + "step": 415 + }, + { + "epoch": 2.6838709677419352, + "grad_norm": 0.49497148394584656, + "learning_rate": 7.37724594004266e-06, + "loss": 0.0331, + "step": 416 + }, + { + "epoch": 2.6903225806451614, + "grad_norm": 0.4186260998249054, + "learning_rate": 7.313132692341263e-06, + "loss": 0.0294, + "step": 417 + }, + { + "epoch": 2.696774193548387, + "grad_norm": 0.4715961813926697, + "learning_rate": 7.249183857331064e-06, + "loss": 0.0293, + "step": 418 + }, + { + "epoch": 2.703225806451613, + "grad_norm": 0.48064178228378296, + "learning_rate": 7.185401462054495e-06, + "loss": 0.0312, + "step": 419 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 0.4826470613479614, + "learning_rate": 7.121787528278177e-06, + "loss": 0.0303, + "step": 420 + }, + { + "epoch": 2.7161290322580647, + "grad_norm": 0.39333951473236084, + "learning_rate": 7.058344072428877e-06, + "loss": 0.0211, + "step": 421 + }, + { + "epoch": 2.7225806451612904, + "grad_norm": 0.3964556157588959, + "learning_rate": 6.99507310552957e-06, + "loss": 0.031, + "step": 422 + }, + { + "epoch": 2.729032258064516, + "grad_norm": 0.5450259447097778, + "learning_rate": 6.931976633135695e-06, + "loss": 0.0344, + "step": 423 + }, + { + "epoch": 2.735483870967742, + "grad_norm": 0.4331640601158142, + "learning_rate": 6.869056655271588e-06, + "loss": 0.0261, + "step": 424 + }, + { + "epoch": 2.741935483870968, + "grad_norm": 0.46446603536605835, + "learning_rate": 6.806315166367075e-06, + "loss": 0.0311, + "step": 425 + }, + { + "epoch": 2.7483870967741937, + "grad_norm": 0.5200790166854858, + "learning_rate": 6.743754155194268e-06, + "loss": 0.0292, + "step": 426 + }, + { + "epoch": 2.7548387096774194, + "grad_norm": 0.6154363751411438, + "learning_rate": 6.681375604804521e-06, + "loss": 0.0252, + "step": 427 + }, + { + "epoch": 2.761290322580645, + "grad_norm": 0.43054288625717163, + "learning_rate": 6.619181492465557e-06, + "loss": 0.0225, + "step": 428 + }, + { + "epoch": 2.767741935483871, + "grad_norm": 0.5042747259140015, + "learning_rate": 6.55717378959881e-06, + "loss": 0.0266, + "step": 429 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.5118414759635925, + "learning_rate": 6.4953544617169376e-06, + "loss": 0.0324, + "step": 430 + }, + { + "epoch": 2.7806451612903227, + "grad_norm": 0.4071415364742279, + "learning_rate": 6.43372546836149e-06, + "loss": 0.0306, + "step": 431 + }, + { + "epoch": 2.7870967741935484, + "grad_norm": 0.43498843908309937, + "learning_rate": 6.372288763040833e-06, + "loss": 0.0267, + "step": 432 + }, + { + "epoch": 2.793548387096774, + "grad_norm": 0.6407294273376465, + "learning_rate": 6.3110462931682075e-06, + "loss": 0.0529, + "step": 433 + }, + { + "epoch": 2.8, + "grad_norm": 0.4058496356010437, + "learning_rate": 6.250000000000003e-06, + "loss": 0.0289, + "step": 434 + }, + { + "epoch": 2.806451612903226, + "grad_norm": 0.34818795323371887, + "learning_rate": 6.1891518185742116e-06, + "loss": 0.0236, + "step": 435 + }, + { + "epoch": 2.8129032258064517, + "grad_norm": 0.4517665505409241, + "learning_rate": 6.1285036776491165e-06, + "loss": 0.0341, + "step": 436 + }, + { + "epoch": 2.8193548387096774, + "grad_norm": 0.5423181056976318, + "learning_rate": 6.068057499642144e-06, + "loss": 0.0406, + "step": 437 + }, + { + "epoch": 2.825806451612903, + "grad_norm": 0.4574117362499237, + "learning_rate": 6.007815200568906e-06, + "loss": 0.0344, + "step": 438 + }, + { + "epoch": 2.832258064516129, + "grad_norm": 0.4028095006942749, + "learning_rate": 5.9477786899825024e-06, + "loss": 0.026, + "step": 439 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 0.4277281165122986, + "learning_rate": 5.8879498709129735e-06, + "loss": 0.032, + "step": 440 + }, + { + "epoch": 2.8451612903225807, + "grad_norm": 0.4217607080936432, + "learning_rate": 5.82833063980696e-06, + "loss": 0.0275, + "step": 441 + }, + { + "epoch": 2.8516129032258064, + "grad_norm": 0.4865557849407196, + "learning_rate": 5.7689228864676394e-06, + "loss": 0.0344, + "step": 442 + }, + { + "epoch": 2.858064516129032, + "grad_norm": 0.44111689925193787, + "learning_rate": 5.70972849399477e-06, + "loss": 0.028, + "step": 443 + }, + { + "epoch": 2.864516129032258, + "grad_norm": 0.5612359046936035, + "learning_rate": 5.650749338725019e-06, + "loss": 0.0433, + "step": 444 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 0.42652663588523865, + "learning_rate": 5.591987290172518e-06, + "loss": 0.0266, + "step": 445 + }, + { + "epoch": 2.8774193548387097, + "grad_norm": 0.43139341473579407, + "learning_rate": 5.533444210969546e-06, + "loss": 0.0228, + "step": 446 + }, + { + "epoch": 2.8838709677419354, + "grad_norm": 0.3348155915737152, + "learning_rate": 5.475121956807537e-06, + "loss": 0.0236, + "step": 447 + }, + { + "epoch": 2.8903225806451616, + "grad_norm": 0.41803082823753357, + "learning_rate": 5.417022376378239e-06, + "loss": 0.0275, + "step": 448 + }, + { + "epoch": 2.896774193548387, + "grad_norm": 0.4131038784980774, + "learning_rate": 5.359147311315094e-06, + "loss": 0.0265, + "step": 449 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.5227479934692383, + "learning_rate": 5.30149859613492e-06, + "loss": 0.025, + "step": 450 + }, + { + "epoch": 2.9096774193548387, + "grad_norm": 0.43142953515052795, + "learning_rate": 5.244078058179691e-06, + "loss": 0.0249, + "step": 451 + }, + { + "epoch": 2.9161290322580644, + "grad_norm": 0.4158158600330353, + "learning_rate": 5.186887517558653e-06, + "loss": 0.0297, + "step": 452 + }, + { + "epoch": 2.9225806451612906, + "grad_norm": 0.3552153706550598, + "learning_rate": 5.129928787090646e-06, + "loss": 0.0234, + "step": 453 + }, + { + "epoch": 2.9290322580645163, + "grad_norm": 0.49204781651496887, + "learning_rate": 5.073203672246593e-06, + "loss": 0.0379, + "step": 454 + }, + { + "epoch": 2.935483870967742, + "grad_norm": 0.38140571117401123, + "learning_rate": 5.016713971092311e-06, + "loss": 0.0294, + "step": 455 + }, + { + "epoch": 2.9419354838709677, + "grad_norm": 0.5261517763137817, + "learning_rate": 4.960461474231505e-06, + "loss": 0.0305, + "step": 456 + }, + { + "epoch": 2.9483870967741934, + "grad_norm": 0.6391315460205078, + "learning_rate": 4.904447964748993e-06, + "loss": 0.038, + "step": 457 + }, + { + "epoch": 2.9548387096774196, + "grad_norm": 0.3812016546726227, + "learning_rate": 4.848675218154214e-06, + "loss": 0.0259, + "step": 458 + }, + { + "epoch": 2.9612903225806453, + "grad_norm": 0.4748527407646179, + "learning_rate": 4.793145002324933e-06, + "loss": 0.0329, + "step": 459 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 0.4919755458831787, + "learning_rate": 4.737859077451191e-06, + "loss": 0.0253, + "step": 460 + }, + { + "epoch": 2.9741935483870967, + "grad_norm": 0.4986102879047394, + "learning_rate": 4.68281919597954e-06, + "loss": 0.0293, + "step": 461 + }, + { + "epoch": 2.9806451612903224, + "grad_norm": 0.48589223623275757, + "learning_rate": 4.6280271025574695e-06, + "loss": 0.0287, + "step": 462 + }, + { + "epoch": 2.9870967741935486, + "grad_norm": 0.4930824935436249, + "learning_rate": 4.573484533978119e-06, + "loss": 0.0258, + "step": 463 + }, + { + "epoch": 2.9935483870967743, + "grad_norm": 0.38358667492866516, + "learning_rate": 4.5191932191252075e-06, + "loss": 0.0235, + "step": 464 + }, + { + "epoch": 3.0, + "grad_norm": 0.5300599336624146, + "learning_rate": 4.465154878918258e-06, + "loss": 0.0309, + "step": 465 + }, + { + "epoch": 3.0064516129032257, + "grad_norm": 0.24884271621704102, + "learning_rate": 4.411371226258032e-06, + "loss": 0.0182, + "step": 466 + }, + { + "epoch": 3.0129032258064514, + "grad_norm": 0.3061859607696533, + "learning_rate": 4.3578439659722246e-06, + "loss": 0.0161, + "step": 467 + }, + { + "epoch": 3.0193548387096776, + "grad_norm": 0.3014248311519623, + "learning_rate": 4.304574794761447e-06, + "loss": 0.0135, + "step": 468 + }, + { + "epoch": 3.0258064516129033, + "grad_norm": 0.31640511751174927, + "learning_rate": 4.251565401145432e-06, + "loss": 0.0168, + "step": 469 + }, + { + "epoch": 3.032258064516129, + "grad_norm": 0.25447705388069153, + "learning_rate": 4.1988174654095104e-06, + "loss": 0.0117, + "step": 470 + }, + { + "epoch": 3.0387096774193547, + "grad_norm": 0.31153520941734314, + "learning_rate": 4.146332659551364e-06, + "loss": 0.0147, + "step": 471 + }, + { + "epoch": 3.0451612903225804, + "grad_norm": 0.32956749200820923, + "learning_rate": 4.094112647227996e-06, + "loss": 0.015, + "step": 472 + }, + { + "epoch": 3.0516129032258066, + "grad_norm": 0.3105918765068054, + "learning_rate": 4.042159083703031e-06, + "loss": 0.0136, + "step": 473 + }, + { + "epoch": 3.0580645161290323, + "grad_norm": 0.3890332281589508, + "learning_rate": 3.9904736157942355e-06, + "loss": 0.0128, + "step": 474 + }, + { + "epoch": 3.064516129032258, + "grad_norm": 0.2500901222229004, + "learning_rate": 3.939057881821295e-06, + "loss": 0.0109, + "step": 475 + }, + { + "epoch": 3.0709677419354837, + "grad_norm": 0.33506497740745544, + "learning_rate": 3.887913511553917e-06, + "loss": 0.0138, + "step": 476 + }, + { + "epoch": 3.07741935483871, + "grad_norm": 0.35200193524360657, + "learning_rate": 3.837042126160157e-06, + "loss": 0.0163, + "step": 477 + }, + { + "epoch": 3.0838709677419356, + "grad_norm": 0.33882763981819153, + "learning_rate": 3.786445338155013e-06, + "loss": 0.0125, + "step": 478 + }, + { + "epoch": 3.0903225806451613, + "grad_norm": 0.4239828586578369, + "learning_rate": 3.736124751349343e-06, + "loss": 0.0146, + "step": 479 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 0.39569729566574097, + "learning_rate": 3.6860819607990108e-06, + "loss": 0.0149, + "step": 480 + }, + { + "epoch": 3.1032258064516127, + "grad_norm": 0.4424724578857422, + "learning_rate": 3.6363185527543156e-06, + "loss": 0.0147, + "step": 481 + }, + { + "epoch": 3.109677419354839, + "grad_norm": 0.54300457239151, + "learning_rate": 3.5868361046097475e-06, + "loss": 0.0166, + "step": 482 + }, + { + "epoch": 3.1161290322580646, + "grad_norm": 0.41813674569129944, + "learning_rate": 3.537636184853939e-06, + "loss": 0.0131, + "step": 483 + }, + { + "epoch": 3.1225806451612903, + "grad_norm": 0.4122736155986786, + "learning_rate": 3.4887203530199864e-06, + "loss": 0.0141, + "step": 484 + }, + { + "epoch": 3.129032258064516, + "grad_norm": 0.39058443903923035, + "learning_rate": 3.440090159636003e-06, + "loss": 0.0115, + "step": 485 + }, + { + "epoch": 3.135483870967742, + "grad_norm": 0.42365285754203796, + "learning_rate": 3.391747146175954e-06, + "loss": 0.0097, + "step": 486 + }, + { + "epoch": 3.141935483870968, + "grad_norm": 0.5885961055755615, + "learning_rate": 3.3436928450108264e-06, + "loss": 0.0219, + "step": 487 + }, + { + "epoch": 3.1483870967741936, + "grad_norm": 0.39843595027923584, + "learning_rate": 3.2959287793600356e-06, + "loss": 0.0098, + "step": 488 + }, + { + "epoch": 3.1548387096774193, + "grad_norm": 0.27729499340057373, + "learning_rate": 3.2484564632431396e-06, + "loss": 0.0068, + "step": 489 + }, + { + "epoch": 3.161290322580645, + "grad_norm": 0.4415301978588104, + "learning_rate": 3.2012774014318625e-06, + "loss": 0.0151, + "step": 490 + }, + { + "epoch": 3.167741935483871, + "grad_norm": 0.40353745222091675, + "learning_rate": 3.154393089402391e-06, + "loss": 0.016, + "step": 491 + }, + { + "epoch": 3.174193548387097, + "grad_norm": 0.4263345003128052, + "learning_rate": 3.107805013287958e-06, + "loss": 0.0112, + "step": 492 + }, + { + "epoch": 3.1806451612903226, + "grad_norm": 0.37084028124809265, + "learning_rate": 3.061514649831755e-06, + "loss": 0.014, + "step": 493 + }, + { + "epoch": 3.1870967741935483, + "grad_norm": 0.5308308005332947, + "learning_rate": 3.0155234663401146e-06, + "loss": 0.0146, + "step": 494 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 0.47034651041030884, + "learning_rate": 2.9698329206359925e-06, + "loss": 0.0124, + "step": 495 + }, + { + "epoch": 3.2, + "grad_norm": 0.3541916012763977, + "learning_rate": 2.9244444610127764e-06, + "loss": 0.0121, + "step": 496 + }, + { + "epoch": 3.206451612903226, + "grad_norm": 0.4616714417934418, + "learning_rate": 2.8793595261883465e-06, + "loss": 0.0181, + "step": 497 + }, + { + "epoch": 3.2129032258064516, + "grad_norm": 0.36847707629203796, + "learning_rate": 2.8345795452595095e-06, + "loss": 0.0165, + "step": 498 + }, + { + "epoch": 3.2193548387096773, + "grad_norm": 0.3731675148010254, + "learning_rate": 2.790105937656673e-06, + "loss": 0.013, + "step": 499 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 0.42840346693992615, + "learning_rate": 2.7459401130988534e-06, + "loss": 0.0109, + "step": 500 + }, + { + "epoch": 3.232258064516129, + "grad_norm": 0.33602991700172424, + "learning_rate": 2.7020834715490093e-06, + "loss": 0.0106, + "step": 501 + }, + { + "epoch": 3.238709677419355, + "grad_norm": 0.47426825761795044, + "learning_rate": 2.6585374031696474e-06, + "loss": 0.0133, + "step": 502 + }, + { + "epoch": 3.2451612903225806, + "grad_norm": 0.41371604800224304, + "learning_rate": 2.61530328827877e-06, + "loss": 0.0094, + "step": 503 + }, + { + "epoch": 3.2516129032258063, + "grad_norm": 0.4533410668373108, + "learning_rate": 2.5723824973061e-06, + "loss": 0.0123, + "step": 504 + }, + { + "epoch": 3.258064516129032, + "grad_norm": 0.2636722922325134, + "learning_rate": 2.5297763907496746e-06, + "loss": 0.0086, + "step": 505 + }, + { + "epoch": 3.264516129032258, + "grad_norm": 0.48444676399230957, + "learning_rate": 2.4874863191326953e-06, + "loss": 0.0169, + "step": 506 + }, + { + "epoch": 3.270967741935484, + "grad_norm": 0.5979859828948975, + "learning_rate": 2.44551362296072e-06, + "loss": 0.0112, + "step": 507 + }, + { + "epoch": 3.2774193548387096, + "grad_norm": 0.43151959776878357, + "learning_rate": 2.4038596326791884e-06, + "loss": 0.0109, + "step": 508 + }, + { + "epoch": 3.2838709677419353, + "grad_norm": 0.4825892746448517, + "learning_rate": 2.362525668631238e-06, + "loss": 0.013, + "step": 509 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 0.3168151080608368, + "learning_rate": 2.3215130410158424e-06, + "loss": 0.0106, + "step": 510 + }, + { + "epoch": 3.296774193548387, + "grad_norm": 0.4605632722377777, + "learning_rate": 2.2808230498463116e-06, + "loss": 0.0189, + "step": 511 + }, + { + "epoch": 3.303225806451613, + "grad_norm": 0.47640544176101685, + "learning_rate": 2.240456984909049e-06, + "loss": 0.015, + "step": 512 + }, + { + "epoch": 3.3096774193548386, + "grad_norm": 0.5328596234321594, + "learning_rate": 2.2004161257226805e-06, + "loss": 0.0201, + "step": 513 + }, + { + "epoch": 3.3161290322580643, + "grad_norm": 0.5342445969581604, + "learning_rate": 2.16070174149752e-06, + "loss": 0.0109, + "step": 514 + }, + { + "epoch": 3.3225806451612905, + "grad_norm": 0.5308839082717896, + "learning_rate": 2.121315091095297e-06, + "loss": 0.014, + "step": 515 + }, + { + "epoch": 3.329032258064516, + "grad_norm": 0.4669474959373474, + "learning_rate": 2.082257422989281e-06, + "loss": 0.0105, + "step": 516 + }, + { + "epoch": 3.335483870967742, + "grad_norm": 0.37382492423057556, + "learning_rate": 2.0435299752247077e-06, + "loss": 0.0132, + "step": 517 + }, + { + "epoch": 3.3419354838709676, + "grad_norm": 0.4566926062107086, + "learning_rate": 2.0051339753795125e-06, + "loss": 0.0159, + "step": 518 + }, + { + "epoch": 3.3483870967741938, + "grad_norm": 0.4399929642677307, + "learning_rate": 1.9670706405254548e-06, + "loss": 0.0149, + "step": 519 + }, + { + "epoch": 3.3548387096774195, + "grad_norm": 0.3071390986442566, + "learning_rate": 1.929341177189506e-06, + "loss": 0.0085, + "step": 520 + }, + { + "epoch": 3.361290322580645, + "grad_norm": 0.39541929960250854, + "learning_rate": 1.8919467813156121e-06, + "loss": 0.0088, + "step": 521 + }, + { + "epoch": 3.367741935483871, + "grad_norm": 0.49959710240364075, + "learning_rate": 1.854888638226815e-06, + "loss": 0.0147, + "step": 522 + }, + { + "epoch": 3.3741935483870966, + "grad_norm": 0.3740963339805603, + "learning_rate": 1.8181679225876324e-06, + "loss": 0.0099, + "step": 523 + }, + { + "epoch": 3.3806451612903228, + "grad_norm": 0.27066710591316223, + "learning_rate": 1.7817857983668612e-06, + "loss": 0.0071, + "step": 524 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.47990116477012634, + "learning_rate": 1.745743418800669e-06, + "loss": 0.012, + "step": 525 + }, + { + "epoch": 3.393548387096774, + "grad_norm": 0.40311211347579956, + "learning_rate": 1.7100419263560263e-06, + "loss": 0.0102, + "step": 526 + }, + { + "epoch": 3.4, + "grad_norm": 0.4134123623371124, + "learning_rate": 1.6746824526945163e-06, + "loss": 0.0123, + "step": 527 + }, + { + "epoch": 3.4064516129032256, + "grad_norm": 0.44432902336120605, + "learning_rate": 1.6396661186364543e-06, + "loss": 0.0106, + "step": 528 + }, + { + "epoch": 3.412903225806452, + "grad_norm": 0.4438421428203583, + "learning_rate": 1.6049940341253442e-06, + "loss": 0.0172, + "step": 529 + }, + { + "epoch": 3.4193548387096775, + "grad_norm": 0.5001305341720581, + "learning_rate": 1.570667298192724e-06, + "loss": 0.0115, + "step": 530 + }, + { + "epoch": 3.425806451612903, + "grad_norm": 0.36009618639945984, + "learning_rate": 1.5366869989233062e-06, + "loss": 0.0111, + "step": 531 + }, + { + "epoch": 3.432258064516129, + "grad_norm": 0.35556045174598694, + "learning_rate": 1.5030542134205003e-06, + "loss": 0.0129, + "step": 532 + }, + { + "epoch": 3.4387096774193546, + "grad_norm": 0.38800477981567383, + "learning_rate": 1.4697700077722616e-06, + "loss": 0.0111, + "step": 533 + }, + { + "epoch": 3.445161290322581, + "grad_norm": 0.35132497549057007, + "learning_rate": 1.4368354370173073e-06, + "loss": 0.0133, + "step": 534 + }, + { + "epoch": 3.4516129032258065, + "grad_norm": 0.49624019861221313, + "learning_rate": 1.404251545111672e-06, + "loss": 0.0152, + "step": 535 + }, + { + "epoch": 3.458064516129032, + "grad_norm": 0.34581199288368225, + "learning_rate": 1.3720193648956062e-06, + "loss": 0.0093, + "step": 536 + }, + { + "epoch": 3.464516129032258, + "grad_norm": 0.4548514783382416, + "learning_rate": 1.3401399180608551e-06, + "loss": 0.0174, + "step": 537 + }, + { + "epoch": 3.4709677419354836, + "grad_norm": 0.33973830938339233, + "learning_rate": 1.3086142151182605e-06, + "loss": 0.0143, + "step": 538 + }, + { + "epoch": 3.47741935483871, + "grad_norm": 0.3562283515930176, + "learning_rate": 1.2774432553657303e-06, + "loss": 0.0129, + "step": 539 + }, + { + "epoch": 3.4838709677419355, + "grad_norm": 0.42894405126571655, + "learning_rate": 1.2466280268565708e-06, + "loss": 0.0136, + "step": 540 + }, + { + "epoch": 3.490322580645161, + "grad_norm": 0.36266642808914185, + "learning_rate": 1.2161695063681589e-06, + "loss": 0.0152, + "step": 541 + }, + { + "epoch": 3.496774193548387, + "grad_norm": 0.41463732719421387, + "learning_rate": 1.186068659370984e-06, + "loss": 0.0126, + "step": 542 + }, + { + "epoch": 3.5032258064516126, + "grad_norm": 0.3517482876777649, + "learning_rate": 1.1563264399980512e-06, + "loss": 0.0106, + "step": 543 + }, + { + "epoch": 3.509677419354839, + "grad_norm": 0.3592299520969391, + "learning_rate": 1.1269437910146173e-06, + "loss": 0.01, + "step": 544 + }, + { + "epoch": 3.5161290322580645, + "grad_norm": 0.3486897051334381, + "learning_rate": 1.0979216437883327e-06, + "loss": 0.0132, + "step": 545 + }, + { + "epoch": 3.52258064516129, + "grad_norm": 0.3669939935207367, + "learning_rate": 1.069260918259704e-06, + "loss": 0.0108, + "step": 546 + }, + { + "epoch": 3.5290322580645164, + "grad_norm": 0.345688134431839, + "learning_rate": 1.0409625229129292e-06, + "loss": 0.0112, + "step": 547 + }, + { + "epoch": 3.535483870967742, + "grad_norm": 0.40567103028297424, + "learning_rate": 1.0130273547471176e-06, + "loss": 0.017, + "step": 548 + }, + { + "epoch": 3.541935483870968, + "grad_norm": 0.45447733998298645, + "learning_rate": 9.854562992478445e-07, + "loss": 0.0293, + "step": 549 + }, + { + "epoch": 3.5483870967741935, + "grad_norm": 0.40127208828926086, + "learning_rate": 9.582502303590798e-07, + "loss": 0.0151, + "step": 550 + }, + { + "epoch": 3.554838709677419, + "grad_norm": 0.32802486419677734, + "learning_rate": 9.314100104555066e-07, + "loss": 0.0101, + "step": 551 + }, + { + "epoch": 3.5612903225806454, + "grad_norm": 0.35557428002357483, + "learning_rate": 9.049364903151558e-07, + "loss": 0.0108, + "step": 552 + }, + { + "epoch": 3.567741935483871, + "grad_norm": 0.5105459094047546, + "learning_rate": 8.788305090924556e-07, + "loss": 0.0167, + "step": 553 + }, + { + "epoch": 3.574193548387097, + "grad_norm": 0.3577045202255249, + "learning_rate": 8.530928942916447e-07, + "loss": 0.0076, + "step": 554 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 0.2892685532569885, + "learning_rate": 8.277244617405102e-07, + "loss": 0.0077, + "step": 555 + }, + { + "epoch": 3.587096774193548, + "grad_norm": 0.47886940836906433, + "learning_rate": 8.027260155645546e-07, + "loss": 0.0109, + "step": 556 + }, + { + "epoch": 3.5935483870967744, + "grad_norm": 0.3236874043941498, + "learning_rate": 7.780983481614962e-07, + "loss": 0.0074, + "step": 557 + }, + { + "epoch": 3.6, + "grad_norm": 0.46806618571281433, + "learning_rate": 7.538422401761461e-07, + "loss": 0.0174, + "step": 558 + }, + { + "epoch": 3.606451612903226, + "grad_norm": 0.4186045229434967, + "learning_rate": 7.299584604756784e-07, + "loss": 0.0111, + "step": 559 + }, + { + "epoch": 3.6129032258064515, + "grad_norm": 0.4132605791091919, + "learning_rate": 7.064477661252483e-07, + "loss": 0.0132, + "step": 560 + }, + { + "epoch": 3.6193548387096772, + "grad_norm": 0.5827385783195496, + "learning_rate": 6.833109023639928e-07, + "loss": 0.017, + "step": 561 + }, + { + "epoch": 3.6258064516129034, + "grad_norm": 0.3105774521827698, + "learning_rate": 6.605486025814164e-07, + "loss": 0.0091, + "step": 562 + }, + { + "epoch": 3.632258064516129, + "grad_norm": 0.34796178340911865, + "learning_rate": 6.381615882941366e-07, + "loss": 0.0083, + "step": 563 + }, + { + "epoch": 3.638709677419355, + "grad_norm": 0.3462621867656708, + "learning_rate": 6.16150569123021e-07, + "loss": 0.0143, + "step": 564 + }, + { + "epoch": 3.6451612903225805, + "grad_norm": 0.4699903428554535, + "learning_rate": 5.945162427706888e-07, + "loss": 0.0145, + "step": 565 + }, + { + "epoch": 3.6516129032258062, + "grad_norm": 0.42084646224975586, + "learning_rate": 5.732592949993898e-07, + "loss": 0.015, + "step": 566 + }, + { + "epoch": 3.6580645161290324, + "grad_norm": 0.4539680778980255, + "learning_rate": 5.5238039960928e-07, + "loss": 0.0154, + "step": 567 + }, + { + "epoch": 3.664516129032258, + "grad_norm": 0.3853324353694916, + "learning_rate": 5.318802184170565e-07, + "loss": 0.0126, + "step": 568 + }, + { + "epoch": 3.670967741935484, + "grad_norm": 0.409679651260376, + "learning_rate": 5.117594012349735e-07, + "loss": 0.0143, + "step": 569 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 0.42005378007888794, + "learning_rate": 4.920185858502596e-07, + "loss": 0.0129, + "step": 570 + }, + { + "epoch": 3.6838709677419352, + "grad_norm": 0.34078460931777954, + "learning_rate": 4.7265839800488543e-07, + "loss": 0.0132, + "step": 571 + }, + { + "epoch": 3.6903225806451614, + "grad_norm": 0.5289260149002075, + "learning_rate": 4.5367945137573946e-07, + "loss": 0.0114, + "step": 572 + }, + { + "epoch": 3.696774193548387, + "grad_norm": 0.43742361664772034, + "learning_rate": 4.350823475551713e-07, + "loss": 0.0099, + "step": 573 + }, + { + "epoch": 3.703225806451613, + "grad_norm": 0.3581911623477936, + "learning_rate": 4.1686767603192344e-07, + "loss": 0.0153, + "step": 574 + }, + { + "epoch": 3.709677419354839, + "grad_norm": 0.3517850637435913, + "learning_rate": 3.990360141724478e-07, + "loss": 0.0123, + "step": 575 + }, + { + "epoch": 3.7161290322580647, + "grad_norm": 0.44253072142601013, + "learning_rate": 3.815879272025966e-07, + "loss": 0.0068, + "step": 576 + }, + { + "epoch": 3.7225806451612904, + "grad_norm": 0.3429562449455261, + "learning_rate": 3.6452396818971863e-07, + "loss": 0.0121, + "step": 577 + }, + { + "epoch": 3.729032258064516, + "grad_norm": 0.7084751129150391, + "learning_rate": 3.4784467802511797e-07, + "loss": 0.0145, + "step": 578 + }, + { + "epoch": 3.735483870967742, + "grad_norm": 0.388698011636734, + "learning_rate": 3.3155058540691037e-07, + "loss": 0.0107, + "step": 579 + }, + { + "epoch": 3.741935483870968, + "grad_norm": 0.41982078552246094, + "learning_rate": 3.1564220682327314e-07, + "loss": 0.014, + "step": 580 + }, + { + "epoch": 3.7483870967741937, + "grad_norm": 0.645720899105072, + "learning_rate": 3.001200465360593e-07, + "loss": 0.015, + "step": 581 + }, + { + "epoch": 3.7548387096774194, + "grad_norm": 0.5690763592720032, + "learning_rate": 2.8498459656482317e-07, + "loss": 0.0189, + "step": 582 + }, + { + "epoch": 3.761290322580645, + "grad_norm": 0.4491289556026459, + "learning_rate": 2.702363366712257e-07, + "loss": 0.0127, + "step": 583 + }, + { + "epoch": 3.767741935483871, + "grad_norm": 0.4925324618816376, + "learning_rate": 2.5587573434381895e-07, + "loss": 0.0138, + "step": 584 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.6388445496559143, + "learning_rate": 2.41903244783237e-07, + "loss": 0.0145, + "step": 585 + }, + { + "epoch": 3.7806451612903227, + "grad_norm": 0.31376415491104126, + "learning_rate": 2.2831931088775904e-07, + "loss": 0.0135, + "step": 586 + }, + { + "epoch": 3.7870967741935484, + "grad_norm": 0.3549552261829376, + "learning_rate": 2.1512436323927604e-07, + "loss": 0.014, + "step": 587 + }, + { + "epoch": 3.793548387096774, + "grad_norm": 0.446304053068161, + "learning_rate": 2.0231882008963783e-07, + "loss": 0.0144, + "step": 588 + }, + { + "epoch": 3.8, + "grad_norm": 0.45615440607070923, + "learning_rate": 1.8990308734739976e-07, + "loss": 0.0143, + "step": 589 + }, + { + "epoch": 3.806451612903226, + "grad_norm": 0.3915248513221741, + "learning_rate": 1.7787755856495254e-07, + "loss": 0.0131, + "step": 590 + }, + { + "epoch": 3.8129032258064517, + "grad_norm": 0.2556948959827423, + "learning_rate": 1.6624261492605153e-07, + "loss": 0.0061, + "step": 591 + }, + { + "epoch": 3.8193548387096774, + "grad_norm": 0.5648970603942871, + "learning_rate": 1.5499862523372933e-07, + "loss": 0.011, + "step": 592 + }, + { + "epoch": 3.825806451612903, + "grad_norm": 0.30211833119392395, + "learning_rate": 1.4414594589860774e-07, + "loss": 0.0087, + "step": 593 + }, + { + "epoch": 3.832258064516129, + "grad_norm": 0.21295692026615143, + "learning_rate": 1.3368492092760142e-07, + "loss": 0.0059, + "step": 594 + }, + { + "epoch": 3.838709677419355, + "grad_norm": 0.49749764800071716, + "learning_rate": 1.2361588191300983e-07, + "loss": 0.0112, + "step": 595 + }, + { + "epoch": 3.8451612903225807, + "grad_norm": 0.3051888048648834, + "learning_rate": 1.139391480220145e-07, + "loss": 0.0077, + "step": 596 + }, + { + "epoch": 3.8516129032258064, + "grad_norm": 0.3583107590675354, + "learning_rate": 1.0465502598655114e-07, + "loss": 0.0115, + "step": 597 + }, + { + "epoch": 3.858064516129032, + "grad_norm": 0.4116378426551819, + "learning_rate": 9.576381009359508e-08, + "loss": 0.0127, + "step": 598 + }, + { + "epoch": 3.864516129032258, + "grad_norm": 0.3633911907672882, + "learning_rate": 8.726578217582993e-08, + "loss": 0.0109, + "step": 599 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 0.35112428665161133, + "learning_rate": 7.916121160271572e-08, + "loss": 0.0112, + "step": 600 + }, + { + "epoch": 3.8774193548387097, + "grad_norm": 0.4247336685657501, + "learning_rate": 7.145035527194588e-08, + "loss": 0.0103, + "step": 601 + }, + { + "epoch": 3.8838709677419354, + "grad_norm": 0.36888250708580017, + "learning_rate": 6.413345760131057e-08, + "loss": 0.0145, + "step": 602 + }, + { + "epoch": 3.8903225806451616, + "grad_norm": 0.3833377957344055, + "learning_rate": 5.721075052094599e-08, + "loss": 0.0094, + "step": 603 + }, + { + "epoch": 3.896774193548387, + "grad_norm": 0.45575153827667236, + "learning_rate": 5.068245346598332e-08, + "loss": 0.0111, + "step": 604 + }, + { + "epoch": 3.903225806451613, + "grad_norm": 0.3035842776298523, + "learning_rate": 4.454877336958763e-08, + "loss": 0.0071, + "step": 605 + }, + { + "epoch": 3.9096774193548387, + "grad_norm": 0.3646605908870697, + "learning_rate": 3.8809904656410264e-08, + "loss": 0.0111, + "step": 606 + }, + { + "epoch": 3.9161290322580644, + "grad_norm": 0.3893778920173645, + "learning_rate": 3.346602923641473e-08, + "loss": 0.0084, + "step": 607 + }, + { + "epoch": 3.9225806451612906, + "grad_norm": 0.29794201254844666, + "learning_rate": 2.8517316499115932e-08, + "loss": 0.0069, + "step": 608 + }, + { + "epoch": 3.9290322580645163, + "grad_norm": 0.38667142391204834, + "learning_rate": 2.3963923308212288e-08, + "loss": 0.0144, + "step": 609 + }, + { + "epoch": 3.935483870967742, + "grad_norm": 0.39387401938438416, + "learning_rate": 1.9805993996606376e-08, + "loss": 0.0087, + "step": 610 + }, + { + "epoch": 3.9419354838709677, + "grad_norm": 0.3552229106426239, + "learning_rate": 1.604366036184052e-08, + "loss": 0.0093, + "step": 611 + }, + { + "epoch": 3.9483870967741934, + "grad_norm": 0.5617074966430664, + "learning_rate": 1.2677041661907085e-08, + "loss": 0.008, + "step": 612 + }, + { + "epoch": 3.9548387096774196, + "grad_norm": 0.3588564693927765, + "learning_rate": 9.706244611480674e-09, + "loss": 0.016, + "step": 613 + }, + { + "epoch": 3.9612903225806453, + "grad_norm": 0.36114564538002014, + "learning_rate": 7.131363378524991e-09, + "loss": 0.0113, + "step": 614 + }, + { + "epoch": 3.967741935483871, + "grad_norm": 0.5072866678237915, + "learning_rate": 4.952479581311897e-09, + "loss": 0.0128, + "step": 615 + }, + { + "epoch": 3.9741935483870967, + "grad_norm": 0.3469507396221161, + "learning_rate": 3.1696622858373716e-09, + "loss": 0.01, + "step": 616 + }, + { + "epoch": 3.9806451612903224, + "grad_norm": 0.6267412900924683, + "learning_rate": 1.7829680036274276e-09, + "loss": 0.0201, + "step": 617 + }, + { + "epoch": 3.9870967741935486, + "grad_norm": 0.4204852879047394, + "learning_rate": 7.924406899492698e-10, + "loss": 0.0108, + "step": 618 + }, + { + "epoch": 3.9935483870967743, + "grad_norm": 0.2933880090713501, + "learning_rate": 1.9811174241796127e-10, + "loss": 0.0113, + "step": 619 + }, + { + "epoch": 4.0, + "grad_norm": 0.3794356882572174, + "learning_rate": 0.0, + "loss": 0.0109, + "step": 620 + }, + { + "epoch": 4.0, + "step": 620, + "total_flos": 2.582134138035241e+17, + "train_loss": 0.053207253262911355, + "train_runtime": 778.335, + "train_samples_per_second": 25.449, + "train_steps_per_second": 0.797 + } + ], + "logging_steps": 1, + "max_steps": 620, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.582134138035241e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}