{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3869625520110958, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006934812760055479, "grad_norm": 61.60813903808594, "learning_rate": 8.650519031141869e-07, "loss": 2.7928, "mean_token_accuracy": 0.6783367753028869, "step": 5 }, { "epoch": 0.013869625520110958, "grad_norm": 35.546016693115234, "learning_rate": 1.7301038062283738e-06, "loss": 2.3942, "mean_token_accuracy": 0.6943186521530151, "step": 10 }, { "epoch": 0.020804438280166437, "grad_norm": 2.246945858001709, "learning_rate": 2.5951557093425604e-06, "loss": 1.202, "mean_token_accuracy": 0.7397322177886962, "step": 15 }, { "epoch": 0.027739251040221916, "grad_norm": 1.1429805755615234, "learning_rate": 3.4602076124567477e-06, "loss": 0.918, "mean_token_accuracy": 0.7564186692237854, "step": 20 }, { "epoch": 0.03467406380027739, "grad_norm": 0.9538511633872986, "learning_rate": 4.325259515570934e-06, "loss": 0.8104, "mean_token_accuracy": 0.7724308490753173, "step": 25 }, { "epoch": 0.04160887656033287, "grad_norm": 0.7069241404533386, "learning_rate": 5.190311418685121e-06, "loss": 0.7364, "mean_token_accuracy": 0.7827559828758239, "step": 30 }, { "epoch": 0.04854368932038835, "grad_norm": 0.4030636250972748, "learning_rate": 6.055363321799308e-06, "loss": 0.6835, "mean_token_accuracy": 0.7935511350631714, "step": 35 }, { "epoch": 0.05547850208044383, "grad_norm": 0.8142576217651367, "learning_rate": 6.920415224913495e-06, "loss": 0.6478, "mean_token_accuracy": 0.8010085463523865, "step": 40 }, { "epoch": 0.06241331484049931, "grad_norm": 0.2626665532588959, "learning_rate": 7.785467128027681e-06, "loss": 0.6267, "mean_token_accuracy": 0.8053073883056641, "step": 45 }, { "epoch": 0.06934812760055478, "grad_norm": 0.23942551016807556, "learning_rate": 8.650519031141868e-06, "loss": 0.6013, "mean_token_accuracy": 0.8112802267074585, "step": 50 }, { "epoch": 0.07628294036061026, "grad_norm": 0.20308136940002441, "learning_rate": 9.515570934256055e-06, "loss": 0.5769, "mean_token_accuracy": 0.8168688178062439, "step": 55 }, { "epoch": 0.08321775312066575, "grad_norm": 0.1854431927204132, "learning_rate": 1.0380622837370241e-05, "loss": 0.5805, "mean_token_accuracy": 0.815436840057373, "step": 60 }, { "epoch": 0.09015256588072122, "grad_norm": 0.1700541228055954, "learning_rate": 1.124567474048443e-05, "loss": 0.5652, "mean_token_accuracy": 0.8188095331192017, "step": 65 }, { "epoch": 0.0970873786407767, "grad_norm": 0.18573108315467834, "learning_rate": 1.2110726643598615e-05, "loss": 0.5524, "mean_token_accuracy": 0.8222507953643798, "step": 70 }, { "epoch": 0.10402219140083217, "grad_norm": 0.18843185901641846, "learning_rate": 1.2975778546712803e-05, "loss": 0.542, "mean_token_accuracy": 0.8249342203140259, "step": 75 }, { "epoch": 0.11095700416088766, "grad_norm": 0.21635942161083221, "learning_rate": 1.384083044982699e-05, "loss": 0.5401, "mean_token_accuracy": 0.8251730322837829, "step": 80 }, { "epoch": 0.11789181692094314, "grad_norm": 0.21325534582138062, "learning_rate": 1.4705882352941177e-05, "loss": 0.5404, "mean_token_accuracy": 0.8243620276451111, "step": 85 }, { "epoch": 0.12482662968099861, "grad_norm": 0.22691610455513, "learning_rate": 1.5570934256055363e-05, "loss": 0.5278, "mean_token_accuracy": 0.8283108592033386, "step": 90 }, { "epoch": 0.1317614424410541, "grad_norm": 0.2375083714723587, "learning_rate": 1.643598615916955e-05, "loss": 0.5198, "mean_token_accuracy": 0.8296842217445374, "step": 95 }, { "epoch": 0.13869625520110956, "grad_norm": 0.27802157402038574, "learning_rate": 1.7301038062283735e-05, "loss": 0.5346, "mean_token_accuracy": 0.825625765323639, "step": 100 }, { "epoch": 0.14563106796116504, "grad_norm": 0.3691716194152832, "learning_rate": 1.8166089965397926e-05, "loss": 0.5249, "mean_token_accuracy": 0.828523588180542, "step": 105 }, { "epoch": 0.15256588072122051, "grad_norm": 0.30235254764556885, "learning_rate": 1.903114186851211e-05, "loss": 0.514, "mean_token_accuracy": 0.8320501446723938, "step": 110 }, { "epoch": 0.15950069348127602, "grad_norm": 0.3447076380252838, "learning_rate": 1.98961937716263e-05, "loss": 0.5203, "mean_token_accuracy": 0.8298335313796997, "step": 115 }, { "epoch": 0.1664355062413315, "grad_norm": 0.28185489773750305, "learning_rate": 2.0761245674740483e-05, "loss": 0.5305, "mean_token_accuracy": 0.8264262914657593, "step": 120 }, { "epoch": 0.17337031900138697, "grad_norm": 0.2629449963569641, "learning_rate": 2.1626297577854674e-05, "loss": 0.5167, "mean_token_accuracy": 0.8303680658340454, "step": 125 }, { "epoch": 0.18030513176144244, "grad_norm": 0.2788124680519104, "learning_rate": 2.249134948096886e-05, "loss": 0.5275, "mean_token_accuracy": 0.8274267673492431, "step": 130 }, { "epoch": 0.18723994452149792, "grad_norm": 0.24550440907478333, "learning_rate": 2.3356401384083046e-05, "loss": 0.5073, "mean_token_accuracy": 0.8328269720077515, "step": 135 }, { "epoch": 0.1941747572815534, "grad_norm": 0.7636565566062927, "learning_rate": 2.422145328719723e-05, "loss": 0.5138, "mean_token_accuracy": 0.8308726906776428, "step": 140 }, { "epoch": 0.20110957004160887, "grad_norm": 0.6163385510444641, "learning_rate": 2.508650519031142e-05, "loss": 0.5123, "mean_token_accuracy": 0.8311893105506897, "step": 145 }, { "epoch": 0.20804438280166435, "grad_norm": 0.3808706998825073, "learning_rate": 2.5951557093425606e-05, "loss": 0.5018, "mean_token_accuracy": 0.8343647360801697, "step": 150 }, { "epoch": 0.21497919556171982, "grad_norm": 0.2565021216869354, "learning_rate": 2.6816608996539794e-05, "loss": 0.4943, "mean_token_accuracy": 0.8362817883491516, "step": 155 }, { "epoch": 0.22191400832177532, "grad_norm": 0.3511087894439697, "learning_rate": 2.768166089965398e-05, "loss": 0.4933, "mean_token_accuracy": 0.8366880297660828, "step": 160 }, { "epoch": 0.2288488210818308, "grad_norm": 0.4006827175617218, "learning_rate": 2.8546712802768166e-05, "loss": 0.4906, "mean_token_accuracy": 0.837351131439209, "step": 165 }, { "epoch": 0.23578363384188628, "grad_norm": 0.4149070978164673, "learning_rate": 2.9411764705882354e-05, "loss": 0.4988, "mean_token_accuracy": 0.8351827621459961, "step": 170 }, { "epoch": 0.24271844660194175, "grad_norm": 0.32881560921669006, "learning_rate": 3.0276816608996538e-05, "loss": 0.5085, "mean_token_accuracy": 0.831884253025055, "step": 175 }, { "epoch": 0.24965325936199723, "grad_norm": 0.46366971731185913, "learning_rate": 3.1141868512110726e-05, "loss": 0.4964, "mean_token_accuracy": 0.8355090618133545, "step": 180 }, { "epoch": 0.2565880721220527, "grad_norm": 0.838777482509613, "learning_rate": 3.200692041522492e-05, "loss": 0.5078, "mean_token_accuracy": 0.8320568442344666, "step": 185 }, { "epoch": 0.2635228848821082, "grad_norm": 30.57767677307129, "learning_rate": 3.28719723183391e-05, "loss": 0.4978, "mean_token_accuracy": 0.8360116839408874, "step": 190 }, { "epoch": 0.27045769764216365, "grad_norm": 0.5885879993438721, "learning_rate": 3.373702422145329e-05, "loss": 0.497, "mean_token_accuracy": 0.8356186389923096, "step": 195 }, { "epoch": 0.27739251040221913, "grad_norm": 0.3610420525074005, "learning_rate": 3.460207612456747e-05, "loss": 0.4989, "mean_token_accuracy": 0.8350513100624084, "step": 200 }, { "epoch": 0.2843273231622746, "grad_norm": 0.3954995572566986, "learning_rate": 3.546712802768166e-05, "loss": 0.5011, "mean_token_accuracy": 0.83415766954422, "step": 205 }, { "epoch": 0.2912621359223301, "grad_norm": 0.3071337342262268, "learning_rate": 3.633217993079585e-05, "loss": 0.5255, "mean_token_accuracy": 0.8285403490066529, "step": 210 }, { "epoch": 0.29819694868238555, "grad_norm": 0.31758391857147217, "learning_rate": 3.719723183391004e-05, "loss": 0.4954, "mean_token_accuracy": 0.835390031337738, "step": 215 }, { "epoch": 0.30513176144244103, "grad_norm": 0.3296087086200714, "learning_rate": 3.806228373702422e-05, "loss": 0.4923, "mean_token_accuracy": 0.8361375451087951, "step": 220 }, { "epoch": 0.3120665742024965, "grad_norm": 0.2980894446372986, "learning_rate": 3.892733564013841e-05, "loss": 0.4996, "mean_token_accuracy": 0.8341476917266846, "step": 225 }, { "epoch": 0.31900138696255204, "grad_norm": 0.2892495095729828, "learning_rate": 3.97923875432526e-05, "loss": 0.4855, "mean_token_accuracy": 0.8382086515426636, "step": 230 }, { "epoch": 0.3259361997226075, "grad_norm": 0.29287102818489075, "learning_rate": 4.065743944636679e-05, "loss": 0.4944, "mean_token_accuracy": 0.8353524923324585, "step": 235 }, { "epoch": 0.332871012482663, "grad_norm": 0.28245487809181213, "learning_rate": 4.1522491349480966e-05, "loss": 0.4887, "mean_token_accuracy": 0.8373544692993165, "step": 240 }, { "epoch": 0.33980582524271846, "grad_norm": 0.23551802337169647, "learning_rate": 4.238754325259516e-05, "loss": 0.4925, "mean_token_accuracy": 0.8361364006996155, "step": 245 }, { "epoch": 0.34674063800277394, "grad_norm": 0.24266427755355835, "learning_rate": 4.325259515570935e-05, "loss": 0.4759, "mean_token_accuracy": 0.8410738468170166, "step": 250 }, { "epoch": 0.3536754507628294, "grad_norm": 0.33316895365715027, "learning_rate": 4.411764705882353e-05, "loss": 0.4902, "mean_token_accuracy": 0.8370885252952576, "step": 255 }, { "epoch": 0.3606102635228849, "grad_norm": 0.5113539099693298, "learning_rate": 4.498269896193772e-05, "loss": 0.4918, "mean_token_accuracy": 0.8364068984985351, "step": 260 }, { "epoch": 0.36754507628294036, "grad_norm": 0.3733905851840973, "learning_rate": 4.58477508650519e-05, "loss": 0.49, "mean_token_accuracy": 0.8370036244392395, "step": 265 }, { "epoch": 0.37447988904299584, "grad_norm": 0.4112997353076935, "learning_rate": 4.671280276816609e-05, "loss": 0.4932, "mean_token_accuracy": 0.8356328129768371, "step": 270 }, { "epoch": 0.3814147018030513, "grad_norm": 0.5121487379074097, "learning_rate": 4.7577854671280283e-05, "loss": 0.479, "mean_token_accuracy": 0.839626955986023, "step": 275 }, { "epoch": 0.3883495145631068, "grad_norm": 0.36294957995414734, "learning_rate": 4.844290657439446e-05, "loss": 0.4829, "mean_token_accuracy": 0.8391167283058166, "step": 280 }, { "epoch": 0.39528432732316227, "grad_norm": 0.3162820339202881, "learning_rate": 4.930795847750865e-05, "loss": 0.4899, "mean_token_accuracy": 0.8368083834648132, "step": 285 }, { "epoch": 0.40221914008321774, "grad_norm": 0.3973437547683716, "learning_rate": 4.9980732177263974e-05, "loss": 0.4864, "mean_token_accuracy": 0.8374906539916992, "step": 290 }, { "epoch": 0.4091539528432732, "grad_norm": 0.5423433184623718, "learning_rate": 4.9884393063583816e-05, "loss": 0.4907, "mean_token_accuracy": 0.8373413920402527, "step": 295 }, { "epoch": 0.4160887656033287, "grad_norm": 0.39722123742103577, "learning_rate": 4.9788053949903666e-05, "loss": 0.4961, "mean_token_accuracy": 0.8351489901542664, "step": 300 }, { "epoch": 0.42302357836338417, "grad_norm": 0.34169071912765503, "learning_rate": 4.969171483622351e-05, "loss": 0.4891, "mean_token_accuracy": 0.8370493412017822, "step": 305 }, { "epoch": 0.42995839112343964, "grad_norm": 0.3429335951805115, "learning_rate": 4.959537572254335e-05, "loss": 0.4794, "mean_token_accuracy": 0.8396916627883911, "step": 310 }, { "epoch": 0.4368932038834951, "grad_norm": 0.3266272246837616, "learning_rate": 4.94990366088632e-05, "loss": 0.4757, "mean_token_accuracy": 0.8405494570732117, "step": 315 }, { "epoch": 0.44382801664355065, "grad_norm": 0.2874930202960968, "learning_rate": 4.940269749518305e-05, "loss": 0.4978, "mean_token_accuracy": 0.8344841122627258, "step": 320 }, { "epoch": 0.4507628294036061, "grad_norm": 0.2812349498271942, "learning_rate": 4.930635838150289e-05, "loss": 0.4839, "mean_token_accuracy": 0.8383953332901001, "step": 325 }, { "epoch": 0.4576976421636616, "grad_norm": 0.25296345353126526, "learning_rate": 4.921001926782274e-05, "loss": 0.4738, "mean_token_accuracy": 0.8412886261940002, "step": 330 }, { "epoch": 0.4646324549237171, "grad_norm": 0.22165291011333466, "learning_rate": 4.9113680154142584e-05, "loss": 0.4867, "mean_token_accuracy": 0.8379201173782349, "step": 335 }, { "epoch": 0.47156726768377255, "grad_norm": 0.2551758289337158, "learning_rate": 4.9017341040462426e-05, "loss": 0.4786, "mean_token_accuracy": 0.8399594306945801, "step": 340 }, { "epoch": 0.478502080443828, "grad_norm": 0.25708919763565063, "learning_rate": 4.8921001926782276e-05, "loss": 0.48, "mean_token_accuracy": 0.8395551085472107, "step": 345 }, { "epoch": 0.4854368932038835, "grad_norm": 0.1992408186197281, "learning_rate": 4.8824662813102125e-05, "loss": 0.4714, "mean_token_accuracy": 0.8418668508529663, "step": 350 }, { "epoch": 0.492371705963939, "grad_norm": 0.23445720970630646, "learning_rate": 4.872832369942197e-05, "loss": 0.471, "mean_token_accuracy": 0.8421580553054809, "step": 355 }, { "epoch": 0.49930651872399445, "grad_norm": 0.31462928652763367, "learning_rate": 4.863198458574181e-05, "loss": 0.4711, "mean_token_accuracy": 0.842027747631073, "step": 360 }, { "epoch": 0.5062413314840499, "grad_norm": 0.24767646193504333, "learning_rate": 4.853564547206166e-05, "loss": 0.4717, "mean_token_accuracy": 0.8417503118515015, "step": 365 }, { "epoch": 0.5131761442441054, "grad_norm": 0.2389938235282898, "learning_rate": 4.84393063583815e-05, "loss": 0.4677, "mean_token_accuracy": 0.8431912064552307, "step": 370 }, { "epoch": 0.5201109570041609, "grad_norm": 0.29998722672462463, "learning_rate": 4.834296724470135e-05, "loss": 0.4877, "mean_token_accuracy": 0.8374402284622192, "step": 375 }, { "epoch": 0.5270457697642164, "grad_norm": 0.2877121865749359, "learning_rate": 4.82466281310212e-05, "loss": 0.4863, "mean_token_accuracy": 0.8380719065666199, "step": 380 }, { "epoch": 0.5339805825242718, "grad_norm": 0.24628062546253204, "learning_rate": 4.815028901734104e-05, "loss": 0.4665, "mean_token_accuracy": 0.8434135437011718, "step": 385 }, { "epoch": 0.5409153952843273, "grad_norm": 0.24347947537899017, "learning_rate": 4.8053949903660886e-05, "loss": 0.476, "mean_token_accuracy": 0.8404138565063477, "step": 390 }, { "epoch": 0.5478502080443828, "grad_norm": 0.20724909007549286, "learning_rate": 4.7957610789980735e-05, "loss": 0.4881, "mean_token_accuracy": 0.8372583389282227, "step": 395 }, { "epoch": 0.5547850208044383, "grad_norm": 0.2162594497203827, "learning_rate": 4.786127167630058e-05, "loss": 0.4726, "mean_token_accuracy": 0.842011570930481, "step": 400 }, { "epoch": 0.5617198335644937, "grad_norm": 0.34494099020957947, "learning_rate": 4.776493256262042e-05, "loss": 0.4783, "mean_token_accuracy": 0.8399308085441589, "step": 405 }, { "epoch": 0.5686546463245492, "grad_norm": 0.24402566254138947, "learning_rate": 4.7668593448940276e-05, "loss": 0.4953, "mean_token_accuracy": 0.8352864623069763, "step": 410 }, { "epoch": 0.5755894590846047, "grad_norm": 0.2124612033367157, "learning_rate": 4.757225433526012e-05, "loss": 0.4849, "mean_token_accuracy": 0.8380748987197876, "step": 415 }, { "epoch": 0.5825242718446602, "grad_norm": 0.20577934384346008, "learning_rate": 4.747591522157996e-05, "loss": 0.4591, "mean_token_accuracy": 0.845665693283081, "step": 420 }, { "epoch": 0.5894590846047156, "grad_norm": 0.2838655710220337, "learning_rate": 4.737957610789981e-05, "loss": 0.4709, "mean_token_accuracy": 0.8418583750724793, "step": 425 }, { "epoch": 0.5963938973647711, "grad_norm": 0.2222902923822403, "learning_rate": 4.7283236994219653e-05, "loss": 0.4817, "mean_token_accuracy": 0.8388337612152099, "step": 430 }, { "epoch": 0.6033287101248266, "grad_norm": 0.25565460324287415, "learning_rate": 4.7186897880539496e-05, "loss": 0.4724, "mean_token_accuracy": 0.8415215969085693, "step": 435 }, { "epoch": 0.6102635228848821, "grad_norm": 0.680081844329834, "learning_rate": 4.709055876685935e-05, "loss": 0.4777, "mean_token_accuracy": 0.8402902245521545, "step": 440 }, { "epoch": 0.6171983356449375, "grad_norm": 0.3035682141780853, "learning_rate": 4.6994219653179195e-05, "loss": 0.4749, "mean_token_accuracy": 0.8405117988586426, "step": 445 }, { "epoch": 0.624133148404993, "grad_norm": 0.22393807768821716, "learning_rate": 4.689788053949904e-05, "loss": 0.4735, "mean_token_accuracy": 0.8410566568374633, "step": 450 }, { "epoch": 0.6310679611650486, "grad_norm": 0.23452860116958618, "learning_rate": 4.6801541425818887e-05, "loss": 0.4798, "mean_token_accuracy": 0.8394344925880433, "step": 455 }, { "epoch": 0.6380027739251041, "grad_norm": 0.21135355532169342, "learning_rate": 4.670520231213873e-05, "loss": 0.4783, "mean_token_accuracy": 0.8398800015449523, "step": 460 }, { "epoch": 0.6449375866851595, "grad_norm": 0.2495516985654831, "learning_rate": 4.660886319845857e-05, "loss": 0.4769, "mean_token_accuracy": 0.8407980084419251, "step": 465 }, { "epoch": 0.651872399445215, "grad_norm": 0.25724372267723083, "learning_rate": 4.651252408477843e-05, "loss": 0.4764, "mean_token_accuracy": 0.8402070879936219, "step": 470 }, { "epoch": 0.6588072122052705, "grad_norm": 0.28974995017051697, "learning_rate": 4.641618497109827e-05, "loss": 0.468, "mean_token_accuracy": 0.8425545215606689, "step": 475 }, { "epoch": 0.665742024965326, "grad_norm": 0.26298555731773376, "learning_rate": 4.631984585741811e-05, "loss": 0.4752, "mean_token_accuracy": 0.8405273199081421, "step": 480 }, { "epoch": 0.6726768377253814, "grad_norm": 0.3188522756099701, "learning_rate": 4.622350674373796e-05, "loss": 0.4683, "mean_token_accuracy": 0.8426392555236817, "step": 485 }, { "epoch": 0.6796116504854369, "grad_norm": 0.2528276741504669, "learning_rate": 4.6127167630057805e-05, "loss": 0.4753, "mean_token_accuracy": 0.840662169456482, "step": 490 }, { "epoch": 0.6865464632454924, "grad_norm": 0.3695737421512604, "learning_rate": 4.603082851637765e-05, "loss": 0.501, "mean_token_accuracy": 0.8371694445610046, "step": 495 }, { "epoch": 0.6934812760055479, "grad_norm": 0.31206727027893066, "learning_rate": 4.59344894026975e-05, "loss": 0.478, "mean_token_accuracy": 0.8401562452316285, "step": 500 }, { "epoch": 0.7004160887656034, "grad_norm": 3.478522539138794, "learning_rate": 4.5838150289017346e-05, "loss": 0.49, "mean_token_accuracy": 0.8365014433860779, "step": 505 }, { "epoch": 0.7073509015256588, "grad_norm": 0.4430016875267029, "learning_rate": 4.574181117533719e-05, "loss": 0.47, "mean_token_accuracy": 0.8422938823699951, "step": 510 }, { "epoch": 0.7142857142857143, "grad_norm": 0.28713470697402954, "learning_rate": 4.564547206165704e-05, "loss": 0.4786, "mean_token_accuracy": 0.8401166200637817, "step": 515 }, { "epoch": 0.7212205270457698, "grad_norm": 0.2158370316028595, "learning_rate": 4.554913294797688e-05, "loss": 0.4703, "mean_token_accuracy": 0.8421276092529297, "step": 520 }, { "epoch": 0.7281553398058253, "grad_norm": 0.2426484376192093, "learning_rate": 4.545279383429672e-05, "loss": 0.469, "mean_token_accuracy": 0.8426563143730164, "step": 525 }, { "epoch": 0.7350901525658807, "grad_norm": 0.27153995633125305, "learning_rate": 4.535645472061657e-05, "loss": 0.4754, "mean_token_accuracy": 0.8406094431877136, "step": 530 }, { "epoch": 0.7420249653259362, "grad_norm": 0.1991535872220993, "learning_rate": 4.526011560693642e-05, "loss": 0.4782, "mean_token_accuracy": 0.8397158980369568, "step": 535 }, { "epoch": 0.7489597780859917, "grad_norm": 0.15923242270946503, "learning_rate": 4.5163776493256264e-05, "loss": 0.4563, "mean_token_accuracy": 0.8461790800094604, "step": 540 }, { "epoch": 0.7558945908460472, "grad_norm": 0.18306083977222443, "learning_rate": 4.5067437379576114e-05, "loss": 0.4791, "mean_token_accuracy": 0.8393635034561158, "step": 545 }, { "epoch": 0.7628294036061026, "grad_norm": 0.24309256672859192, "learning_rate": 4.4971098265895956e-05, "loss": 0.4777, "mean_token_accuracy": 0.8401144862174987, "step": 550 }, { "epoch": 0.7697642163661581, "grad_norm": 0.20910784602165222, "learning_rate": 4.48747591522158e-05, "loss": 0.4728, "mean_token_accuracy": 0.8417426466941833, "step": 555 }, { "epoch": 0.7766990291262136, "grad_norm": 0.1896984726190567, "learning_rate": 4.477842003853565e-05, "loss": 0.4557, "mean_token_accuracy": 0.8461586833000183, "step": 560 }, { "epoch": 0.7836338418862691, "grad_norm": 0.18798613548278809, "learning_rate": 4.46820809248555e-05, "loss": 0.457, "mean_token_accuracy": 0.8459754705429077, "step": 565 }, { "epoch": 0.7905686546463245, "grad_norm": 0.18959036469459534, "learning_rate": 4.458574181117534e-05, "loss": 0.4633, "mean_token_accuracy": 0.8437102913856507, "step": 570 }, { "epoch": 0.79750346740638, "grad_norm": 0.16292130947113037, "learning_rate": 4.448940269749519e-05, "loss": 0.4749, "mean_token_accuracy": 0.8404599308967591, "step": 575 }, { "epoch": 0.8044382801664355, "grad_norm": 0.17686040699481964, "learning_rate": 4.439306358381503e-05, "loss": 0.4601, "mean_token_accuracy": 0.844899308681488, "step": 580 }, { "epoch": 0.811373092926491, "grad_norm": 0.1865614652633667, "learning_rate": 4.4296724470134875e-05, "loss": 0.4533, "mean_token_accuracy": 0.846677553653717, "step": 585 }, { "epoch": 0.8183079056865464, "grad_norm": 0.2037810981273651, "learning_rate": 4.4200385356454724e-05, "loss": 0.4575, "mean_token_accuracy": 0.8457266449928283, "step": 590 }, { "epoch": 0.8252427184466019, "grad_norm": 0.16701985895633698, "learning_rate": 4.4104046242774566e-05, "loss": 0.466, "mean_token_accuracy": 0.8428797006607056, "step": 595 }, { "epoch": 0.8321775312066574, "grad_norm": 0.19714096188545227, "learning_rate": 4.4007707129094416e-05, "loss": 0.4696, "mean_token_accuracy": 0.8422728657722474, "step": 600 }, { "epoch": 0.8391123439667129, "grad_norm": 0.20772860944271088, "learning_rate": 4.391136801541426e-05, "loss": 0.4635, "mean_token_accuracy": 0.8438523054122925, "step": 605 }, { "epoch": 0.8460471567267683, "grad_norm": 0.35546374320983887, "learning_rate": 4.381502890173411e-05, "loss": 0.4665, "mean_token_accuracy": 0.8430918097496033, "step": 610 }, { "epoch": 0.8529819694868238, "grad_norm": 0.19986563920974731, "learning_rate": 4.371868978805395e-05, "loss": 0.4742, "mean_token_accuracy": 0.8409379243850708, "step": 615 }, { "epoch": 0.8599167822468793, "grad_norm": 0.4013294279575348, "learning_rate": 4.36223506743738e-05, "loss": 0.4673, "mean_token_accuracy": 0.8426662087440491, "step": 620 }, { "epoch": 0.8668515950069348, "grad_norm": 0.29566317796707153, "learning_rate": 4.352601156069364e-05, "loss": 0.4837, "mean_token_accuracy": 0.8380556702613831, "step": 625 }, { "epoch": 0.8737864077669902, "grad_norm": 0.24461045861244202, "learning_rate": 4.342967244701349e-05, "loss": 0.4648, "mean_token_accuracy": 0.8434231281280518, "step": 630 }, { "epoch": 0.8807212205270458, "grad_norm": 0.2197730541229248, "learning_rate": 4.3333333333333334e-05, "loss": 0.4585, "mean_token_accuracy": 0.8448979973793029, "step": 635 }, { "epoch": 0.8876560332871013, "grad_norm": 0.22158759832382202, "learning_rate": 4.3236994219653183e-05, "loss": 0.4678, "mean_token_accuracy": 0.8427410125732422, "step": 640 }, { "epoch": 0.8945908460471568, "grad_norm": 0.17014814913272858, "learning_rate": 4.3140655105973026e-05, "loss": 0.4706, "mean_token_accuracy": 0.8416074395179749, "step": 645 }, { "epoch": 0.9015256588072122, "grad_norm": 0.22929687798023224, "learning_rate": 4.304431599229287e-05, "loss": 0.4753, "mean_token_accuracy": 0.8403880834579468, "step": 650 }, { "epoch": 0.9084604715672677, "grad_norm": 0.20894835889339447, "learning_rate": 4.294797687861272e-05, "loss": 0.4734, "mean_token_accuracy": 0.8410162568092346, "step": 655 }, { "epoch": 0.9153952843273232, "grad_norm": 0.18031327426433563, "learning_rate": 4.285163776493257e-05, "loss": 0.4536, "mean_token_accuracy": 0.8469532251358032, "step": 660 }, { "epoch": 0.9223300970873787, "grad_norm": 0.17288991808891296, "learning_rate": 4.275529865125241e-05, "loss": 0.4611, "mean_token_accuracy": 0.8443895936012268, "step": 665 }, { "epoch": 0.9292649098474342, "grad_norm": 0.1980760544538498, "learning_rate": 4.265895953757226e-05, "loss": 0.484, "mean_token_accuracy": 0.8379009962081909, "step": 670 }, { "epoch": 0.9361997226074896, "grad_norm": 0.20848602056503296, "learning_rate": 4.25626204238921e-05, "loss": 0.4771, "mean_token_accuracy": 0.8398370265960693, "step": 675 }, { "epoch": 0.9431345353675451, "grad_norm": 0.1636408418416977, "learning_rate": 4.2466281310211944e-05, "loss": 0.4578, "mean_token_accuracy": 0.845670223236084, "step": 680 }, { "epoch": 0.9500693481276006, "grad_norm": 0.22376923263072968, "learning_rate": 4.2369942196531794e-05, "loss": 0.4652, "mean_token_accuracy": 0.8431706428527832, "step": 685 }, { "epoch": 0.957004160887656, "grad_norm": 0.21399416029453278, "learning_rate": 4.2273603082851636e-05, "loss": 0.4537, "mean_token_accuracy": 0.8464810252189636, "step": 690 }, { "epoch": 0.9639389736477115, "grad_norm": 2.5790159702301025, "learning_rate": 4.2177263969171485e-05, "loss": 0.4754, "mean_token_accuracy": 0.8421392440795898, "step": 695 }, { "epoch": 0.970873786407767, "grad_norm": 0.2648729085922241, "learning_rate": 4.2080924855491335e-05, "loss": 0.469, "mean_token_accuracy": 0.8423485517501831, "step": 700 }, { "epoch": 0.9778085991678225, "grad_norm": 0.20691435039043427, "learning_rate": 4.198458574181118e-05, "loss": 0.4534, "mean_token_accuracy": 0.8466127276420593, "step": 705 }, { "epoch": 0.984743411927878, "grad_norm": 0.2122969925403595, "learning_rate": 4.188824662813102e-05, "loss": 0.4744, "mean_token_accuracy": 0.843373692035675, "step": 710 }, { "epoch": 0.9916782246879334, "grad_norm": 0.18356889486312866, "learning_rate": 4.179190751445087e-05, "loss": 0.4735, "mean_token_accuracy": 0.840711236000061, "step": 715 }, { "epoch": 0.9986130374479889, "grad_norm": 0.2710322141647339, "learning_rate": 4.169556840077071e-05, "loss": 0.4893, "mean_token_accuracy": 0.8400939464569092, "step": 720 }, { "epoch": 1.0055478502080444, "grad_norm": 0.28685542941093445, "learning_rate": 4.159922928709056e-05, "loss": 0.4413, "mean_token_accuracy": 0.8504527807235718, "step": 725 }, { "epoch": 1.0124826629680999, "grad_norm": 0.24674533307552338, "learning_rate": 4.150289017341041e-05, "loss": 0.4098, "mean_token_accuracy": 0.8587909460067749, "step": 730 }, { "epoch": 1.0194174757281553, "grad_norm": 0.21753250062465668, "learning_rate": 4.140655105973025e-05, "loss": 0.3935, "mean_token_accuracy": 0.8634741544723511, "step": 735 }, { "epoch": 1.0263522884882108, "grad_norm": 0.20492789149284363, "learning_rate": 4.1310211946050096e-05, "loss": 0.4068, "mean_token_accuracy": 0.8603113770484925, "step": 740 }, { "epoch": 1.0332871012482663, "grad_norm": 0.16923396289348602, "learning_rate": 4.1213872832369945e-05, "loss": 0.4185, "mean_token_accuracy": 0.8562816023826599, "step": 745 }, { "epoch": 1.0402219140083218, "grad_norm": 0.18504321575164795, "learning_rate": 4.111753371868979e-05, "loss": 0.4083, "mean_token_accuracy": 0.8588919401168823, "step": 750 }, { "epoch": 1.0471567267683772, "grad_norm": 0.15754340589046478, "learning_rate": 4.102119460500964e-05, "loss": 0.4001, "mean_token_accuracy": 0.8617303729057312, "step": 755 }, { "epoch": 1.0540915395284327, "grad_norm": 0.16705656051635742, "learning_rate": 4.0924855491329486e-05, "loss": 0.4124, "mean_token_accuracy": 0.8577338337898255, "step": 760 }, { "epoch": 1.0610263522884882, "grad_norm": 0.1913621723651886, "learning_rate": 4.082851637764933e-05, "loss": 0.4027, "mean_token_accuracy": 0.860604989528656, "step": 765 }, { "epoch": 1.0679611650485437, "grad_norm": 0.1807246059179306, "learning_rate": 4.073217726396917e-05, "loss": 0.4154, "mean_token_accuracy": 0.8569133520126343, "step": 770 }, { "epoch": 1.0748959778085991, "grad_norm": 0.16904115676879883, "learning_rate": 4.063583815028902e-05, "loss": 0.4043, "mean_token_accuracy": 0.8603330969810485, "step": 775 }, { "epoch": 1.0818307905686546, "grad_norm": 0.13820037245750427, "learning_rate": 4.053949903660886e-05, "loss": 0.4032, "mean_token_accuracy": 0.8607820630073547, "step": 780 }, { "epoch": 1.08876560332871, "grad_norm": 0.15458045899868011, "learning_rate": 4.0443159922928706e-05, "loss": 0.4126, "mean_token_accuracy": 0.8577965140342713, "step": 785 }, { "epoch": 1.0957004160887656, "grad_norm": 0.14621621370315552, "learning_rate": 4.034682080924856e-05, "loss": 0.4164, "mean_token_accuracy": 0.8568703651428222, "step": 790 }, { "epoch": 1.102635228848821, "grad_norm": 0.22418245673179626, "learning_rate": 4.0250481695568404e-05, "loss": 0.4151, "mean_token_accuracy": 0.8572113513946533, "step": 795 }, { "epoch": 1.1095700416088765, "grad_norm": 0.18166805803775787, "learning_rate": 4.015414258188825e-05, "loss": 0.4236, "mean_token_accuracy": 0.8545473575592041, "step": 800 }, { "epoch": 1.116504854368932, "grad_norm": 0.19410911202430725, "learning_rate": 4.0057803468208096e-05, "loss": 0.4081, "mean_token_accuracy": 0.8594057202339173, "step": 805 }, { "epoch": 1.1234396671289875, "grad_norm": 0.15663549304008484, "learning_rate": 3.996146435452794e-05, "loss": 0.41, "mean_token_accuracy": 0.8585174441337585, "step": 810 }, { "epoch": 1.130374479889043, "grad_norm": 0.2926901578903198, "learning_rate": 3.986512524084778e-05, "loss": 0.4088, "mean_token_accuracy": 0.8590117692947388, "step": 815 }, { "epoch": 1.1373092926490984, "grad_norm": 0.14440152049064636, "learning_rate": 3.976878612716764e-05, "loss": 0.4029, "mean_token_accuracy": 0.8609296917915344, "step": 820 }, { "epoch": 1.1442441054091539, "grad_norm": 0.18435537815093994, "learning_rate": 3.967244701348748e-05, "loss": 0.406, "mean_token_accuracy": 0.8599510669708252, "step": 825 }, { "epoch": 1.1511789181692094, "grad_norm": 0.16614344716072083, "learning_rate": 3.957610789980732e-05, "loss": 0.4165, "mean_token_accuracy": 0.8565038800239563, "step": 830 }, { "epoch": 1.1581137309292648, "grad_norm": 0.180514857172966, "learning_rate": 3.947976878612717e-05, "loss": 0.4233, "mean_token_accuracy": 0.854881489276886, "step": 835 }, { "epoch": 1.1650485436893203, "grad_norm": 0.17873796820640564, "learning_rate": 3.9383429672447015e-05, "loss": 0.4142, "mean_token_accuracy": 0.8575613379478455, "step": 840 }, { "epoch": 1.1719833564493758, "grad_norm": 0.17171607911586761, "learning_rate": 3.928709055876686e-05, "loss": 0.4199, "mean_token_accuracy": 0.8555831432342529, "step": 845 }, { "epoch": 1.1789181692094313, "grad_norm": 0.2052180916070938, "learning_rate": 3.9190751445086707e-05, "loss": 0.4084, "mean_token_accuracy": 0.8590785980224609, "step": 850 }, { "epoch": 1.1858529819694867, "grad_norm": 0.16104774177074432, "learning_rate": 3.9094412331406556e-05, "loss": 0.4064, "mean_token_accuracy": 0.8598842978477478, "step": 855 }, { "epoch": 1.1927877947295422, "grad_norm": 0.16743043065071106, "learning_rate": 3.89980732177264e-05, "loss": 0.4136, "mean_token_accuracy": 0.857661247253418, "step": 860 }, { "epoch": 1.1997226074895977, "grad_norm": 0.15085460245609283, "learning_rate": 3.890173410404625e-05, "loss": 0.4143, "mean_token_accuracy": 0.8573906064033509, "step": 865 }, { "epoch": 1.2066574202496532, "grad_norm": 0.14390355348587036, "learning_rate": 3.880539499036609e-05, "loss": 0.4221, "mean_token_accuracy": 0.8549758553504944, "step": 870 }, { "epoch": 1.2135922330097086, "grad_norm": 0.170955091714859, "learning_rate": 3.870905587668593e-05, "loss": 0.417, "mean_token_accuracy": 0.8564952373504638, "step": 875 }, { "epoch": 1.2205270457697641, "grad_norm": 0.17432747781276703, "learning_rate": 3.861271676300578e-05, "loss": 0.4065, "mean_token_accuracy": 0.8594078302383423, "step": 880 }, { "epoch": 1.2274618585298196, "grad_norm": 0.15886807441711426, "learning_rate": 3.851637764932563e-05, "loss": 0.4191, "mean_token_accuracy": 0.8560209155082703, "step": 885 }, { "epoch": 1.234396671289875, "grad_norm": 0.22739961743354797, "learning_rate": 3.8420038535645474e-05, "loss": 0.4136, "mean_token_accuracy": 0.8580429792404175, "step": 890 }, { "epoch": 1.2413314840499305, "grad_norm": 0.1761876940727234, "learning_rate": 3.832369942196532e-05, "loss": 0.4058, "mean_token_accuracy": 0.8598258137702942, "step": 895 }, { "epoch": 1.248266296809986, "grad_norm": 0.8043875098228455, "learning_rate": 3.8227360308285166e-05, "loss": 0.4146, "mean_token_accuracy": 0.8571544885635376, "step": 900 }, { "epoch": 1.2552011095700415, "grad_norm": 0.18043817579746246, "learning_rate": 3.813102119460501e-05, "loss": 0.4032, "mean_token_accuracy": 0.8605299115180969, "step": 905 }, { "epoch": 1.262135922330097, "grad_norm": 0.16484476625919342, "learning_rate": 3.803468208092486e-05, "loss": 0.3987, "mean_token_accuracy": 0.8620869636535644, "step": 910 }, { "epoch": 1.2690707350901524, "grad_norm": 0.15530748665332794, "learning_rate": 3.793834296724471e-05, "loss": 0.4019, "mean_token_accuracy": 0.8608452916145325, "step": 915 }, { "epoch": 1.276005547850208, "grad_norm": 0.16284696757793427, "learning_rate": 3.784200385356455e-05, "loss": 0.4056, "mean_token_accuracy": 0.8598546504974365, "step": 920 }, { "epoch": 1.2829403606102634, "grad_norm": 0.15156075358390808, "learning_rate": 3.774566473988439e-05, "loss": 0.4148, "mean_token_accuracy": 0.8573850750923157, "step": 925 }, { "epoch": 1.2898751733703189, "grad_norm": 0.18044961988925934, "learning_rate": 3.764932562620424e-05, "loss": 0.4165, "mean_token_accuracy": 0.8561815977096557, "step": 930 }, { "epoch": 1.2968099861303743, "grad_norm": 0.1658436506986618, "learning_rate": 3.7552986512524084e-05, "loss": 0.4056, "mean_token_accuracy": 0.8600709080696106, "step": 935 }, { "epoch": 1.3037447988904298, "grad_norm": 0.16520382463932037, "learning_rate": 3.7456647398843934e-05, "loss": 0.4148, "mean_token_accuracy": 0.8568984508514405, "step": 940 }, { "epoch": 1.3106796116504853, "grad_norm": 0.1799880713224411, "learning_rate": 3.736030828516378e-05, "loss": 0.4188, "mean_token_accuracy": 0.856162166595459, "step": 945 }, { "epoch": 1.317614424410541, "grad_norm": 0.16812920570373535, "learning_rate": 3.7263969171483626e-05, "loss": 0.4111, "mean_token_accuracy": 0.8584610104560852, "step": 950 }, { "epoch": 1.3245492371705965, "grad_norm": 0.15165302157402039, "learning_rate": 3.716763005780347e-05, "loss": 0.404, "mean_token_accuracy": 0.860293960571289, "step": 955 }, { "epoch": 1.331484049930652, "grad_norm": 0.13624367117881775, "learning_rate": 3.707129094412332e-05, "loss": 0.4136, "mean_token_accuracy": 0.8572997689247132, "step": 960 }, { "epoch": 1.3384188626907074, "grad_norm": 0.6036350131034851, "learning_rate": 3.697495183044316e-05, "loss": 0.4139, "mean_token_accuracy": 0.8579724669456482, "step": 965 }, { "epoch": 1.345353675450763, "grad_norm": 0.16172119975090027, "learning_rate": 3.6878612716763e-05, "loss": 0.4194, "mean_token_accuracy": 0.8556413412094116, "step": 970 }, { "epoch": 1.3522884882108184, "grad_norm": 0.13519282639026642, "learning_rate": 3.678227360308285e-05, "loss": 0.4053, "mean_token_accuracy": 0.8597145199775695, "step": 975 }, { "epoch": 1.3592233009708738, "grad_norm": 0.14305779337882996, "learning_rate": 3.66859344894027e-05, "loss": 0.4066, "mean_token_accuracy": 0.8596665501594544, "step": 980 }, { "epoch": 1.3661581137309293, "grad_norm": 0.18043436110019684, "learning_rate": 3.6589595375722544e-05, "loss": 0.4061, "mean_token_accuracy": 0.8598026871681214, "step": 985 }, { "epoch": 1.3730929264909848, "grad_norm": 0.12696458399295807, "learning_rate": 3.649325626204239e-05, "loss": 0.4035, "mean_token_accuracy": 0.860295832157135, "step": 990 }, { "epoch": 1.3800277392510403, "grad_norm": 0.15299014747142792, "learning_rate": 3.6396917148362236e-05, "loss": 0.4045, "mean_token_accuracy": 0.8601055145263672, "step": 995 }, { "epoch": 1.3869625520110958, "grad_norm": 0.14797343313694, "learning_rate": 3.630057803468208e-05, "loss": 0.4064, "mean_token_accuracy": 0.8599012613296508, "step": 1000 } ], "logging_steps": 5, "max_steps": 2884, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2953357891076096e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }