{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.3869625520110958,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006934812760055479,
      "grad_norm": 61.60813903808594,
      "learning_rate": 8.650519031141869e-07,
      "loss": 2.7928,
      "mean_token_accuracy": 0.6783367753028869,
      "step": 5
    },
    {
      "epoch": 0.013869625520110958,
      "grad_norm": 35.546016693115234,
      "learning_rate": 1.7301038062283738e-06,
      "loss": 2.3942,
      "mean_token_accuracy": 0.6943186521530151,
      "step": 10
    },
    {
      "epoch": 0.020804438280166437,
      "grad_norm": 2.246945858001709,
      "learning_rate": 2.5951557093425604e-06,
      "loss": 1.202,
      "mean_token_accuracy": 0.7397322177886962,
      "step": 15
    },
    {
      "epoch": 0.027739251040221916,
      "grad_norm": 1.1429805755615234,
      "learning_rate": 3.4602076124567477e-06,
      "loss": 0.918,
      "mean_token_accuracy": 0.7564186692237854,
      "step": 20
    },
    {
      "epoch": 0.03467406380027739,
      "grad_norm": 0.9538511633872986,
      "learning_rate": 4.325259515570934e-06,
      "loss": 0.8104,
      "mean_token_accuracy": 0.7724308490753173,
      "step": 25
    },
    {
      "epoch": 0.04160887656033287,
      "grad_norm": 0.7069241404533386,
      "learning_rate": 5.190311418685121e-06,
      "loss": 0.7364,
      "mean_token_accuracy": 0.7827559828758239,
      "step": 30
    },
    {
      "epoch": 0.04854368932038835,
      "grad_norm": 0.4030636250972748,
      "learning_rate": 6.055363321799308e-06,
      "loss": 0.6835,
      "mean_token_accuracy": 0.7935511350631714,
      "step": 35
    },
    {
      "epoch": 0.05547850208044383,
      "grad_norm": 0.8142576217651367,
      "learning_rate": 6.920415224913495e-06,
      "loss": 0.6478,
      "mean_token_accuracy": 0.8010085463523865,
      "step": 40
    },
    {
      "epoch": 0.06241331484049931,
      "grad_norm": 0.2626665532588959,
      "learning_rate": 7.785467128027681e-06,
      "loss": 0.6267,
      "mean_token_accuracy": 0.8053073883056641,
      "step": 45
    },
    {
      "epoch": 0.06934812760055478,
      "grad_norm": 0.23942551016807556,
      "learning_rate": 8.650519031141868e-06,
      "loss": 0.6013,
      "mean_token_accuracy": 0.8112802267074585,
      "step": 50
    },
    {
      "epoch": 0.07628294036061026,
      "grad_norm": 0.20308136940002441,
      "learning_rate": 9.515570934256055e-06,
      "loss": 0.5769,
      "mean_token_accuracy": 0.8168688178062439,
      "step": 55
    },
    {
      "epoch": 0.08321775312066575,
      "grad_norm": 0.1854431927204132,
      "learning_rate": 1.0380622837370241e-05,
      "loss": 0.5805,
      "mean_token_accuracy": 0.815436840057373,
      "step": 60
    },
    {
      "epoch": 0.09015256588072122,
      "grad_norm": 0.1700541228055954,
      "learning_rate": 1.124567474048443e-05,
      "loss": 0.5652,
      "mean_token_accuracy": 0.8188095331192017,
      "step": 65
    },
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 0.18573108315467834,
      "learning_rate": 1.2110726643598615e-05,
      "loss": 0.5524,
      "mean_token_accuracy": 0.8222507953643798,
      "step": 70
    },
    {
      "epoch": 0.10402219140083217,
      "grad_norm": 0.18843185901641846,
      "learning_rate": 1.2975778546712803e-05,
      "loss": 0.542,
      "mean_token_accuracy": 0.8249342203140259,
      "step": 75
    },
    {
      "epoch": 0.11095700416088766,
      "grad_norm": 0.21635942161083221,
      "learning_rate": 1.384083044982699e-05,
      "loss": 0.5401,
      "mean_token_accuracy": 0.8251730322837829,
      "step": 80
    },
    {
      "epoch": 0.11789181692094314,
      "grad_norm": 0.21325534582138062,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 0.5404,
      "mean_token_accuracy": 0.8243620276451111,
      "step": 85
    },
    {
      "epoch": 0.12482662968099861,
      "grad_norm": 0.22691610455513,
      "learning_rate": 1.5570934256055363e-05,
      "loss": 0.5278,
      "mean_token_accuracy": 0.8283108592033386,
      "step": 90
    },
    {
      "epoch": 0.1317614424410541,
      "grad_norm": 0.2375083714723587,
      "learning_rate": 1.643598615916955e-05,
      "loss": 0.5198,
      "mean_token_accuracy": 0.8296842217445374,
      "step": 95
    },
    {
      "epoch": 0.13869625520110956,
      "grad_norm": 0.27802157402038574,
      "learning_rate": 1.7301038062283735e-05,
      "loss": 0.5346,
      "mean_token_accuracy": 0.825625765323639,
      "step": 100
    },
    {
      "epoch": 0.14563106796116504,
      "grad_norm": 0.3691716194152832,
      "learning_rate": 1.8166089965397926e-05,
      "loss": 0.5249,
      "mean_token_accuracy": 0.828523588180542,
      "step": 105
    },
    {
      "epoch": 0.15256588072122051,
      "grad_norm": 0.30235254764556885,
      "learning_rate": 1.903114186851211e-05,
      "loss": 0.514,
      "mean_token_accuracy": 0.8320501446723938,
      "step": 110
    },
    {
      "epoch": 0.15950069348127602,
      "grad_norm": 0.3447076380252838,
      "learning_rate": 1.98961937716263e-05,
      "loss": 0.5203,
      "mean_token_accuracy": 0.8298335313796997,
      "step": 115
    },
    {
      "epoch": 0.1664355062413315,
      "grad_norm": 0.28185489773750305,
      "learning_rate": 2.0761245674740483e-05,
      "loss": 0.5305,
      "mean_token_accuracy": 0.8264262914657593,
      "step": 120
    },
    {
      "epoch": 0.17337031900138697,
      "grad_norm": 0.2629449963569641,
      "learning_rate": 2.1626297577854674e-05,
      "loss": 0.5167,
      "mean_token_accuracy": 0.8303680658340454,
      "step": 125
    },
    {
      "epoch": 0.18030513176144244,
      "grad_norm": 0.2788124680519104,
      "learning_rate": 2.249134948096886e-05,
      "loss": 0.5275,
      "mean_token_accuracy": 0.8274267673492431,
      "step": 130
    },
    {
      "epoch": 0.18723994452149792,
      "grad_norm": 0.24550440907478333,
      "learning_rate": 2.3356401384083046e-05,
      "loss": 0.5073,
      "mean_token_accuracy": 0.8328269720077515,
      "step": 135
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 0.7636565566062927,
      "learning_rate": 2.422145328719723e-05,
      "loss": 0.5138,
      "mean_token_accuracy": 0.8308726906776428,
      "step": 140
    },
    {
      "epoch": 0.20110957004160887,
      "grad_norm": 0.6163385510444641,
      "learning_rate": 2.508650519031142e-05,
      "loss": 0.5123,
      "mean_token_accuracy": 0.8311893105506897,
      "step": 145
    },
    {
      "epoch": 0.20804438280166435,
      "grad_norm": 0.3808706998825073,
      "learning_rate": 2.5951557093425606e-05,
      "loss": 0.5018,
      "mean_token_accuracy": 0.8343647360801697,
      "step": 150
    },
    {
      "epoch": 0.21497919556171982,
      "grad_norm": 0.2565021216869354,
      "learning_rate": 2.6816608996539794e-05,
      "loss": 0.4943,
      "mean_token_accuracy": 0.8362817883491516,
      "step": 155
    },
    {
      "epoch": 0.22191400832177532,
      "grad_norm": 0.3511087894439697,
      "learning_rate": 2.768166089965398e-05,
      "loss": 0.4933,
      "mean_token_accuracy": 0.8366880297660828,
      "step": 160
    },
    {
      "epoch": 0.2288488210818308,
      "grad_norm": 0.4006827175617218,
      "learning_rate": 2.8546712802768166e-05,
      "loss": 0.4906,
      "mean_token_accuracy": 0.837351131439209,
      "step": 165
    },
    {
      "epoch": 0.23578363384188628,
      "grad_norm": 0.4149070978164673,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.4988,
      "mean_token_accuracy": 0.8351827621459961,
      "step": 170
    },
    {
      "epoch": 0.24271844660194175,
      "grad_norm": 0.32881560921669006,
      "learning_rate": 3.0276816608996538e-05,
      "loss": 0.5085,
      "mean_token_accuracy": 0.831884253025055,
      "step": 175
    },
    {
      "epoch": 0.24965325936199723,
      "grad_norm": 0.46366971731185913,
      "learning_rate": 3.1141868512110726e-05,
      "loss": 0.4964,
      "mean_token_accuracy": 0.8355090618133545,
      "step": 180
    },
    {
      "epoch": 0.2565880721220527,
      "grad_norm": 0.838777482509613,
      "learning_rate": 3.200692041522492e-05,
      "loss": 0.5078,
      "mean_token_accuracy": 0.8320568442344666,
      "step": 185
    },
    {
      "epoch": 0.2635228848821082,
      "grad_norm": 30.57767677307129,
      "learning_rate": 3.28719723183391e-05,
      "loss": 0.4978,
      "mean_token_accuracy": 0.8360116839408874,
      "step": 190
    },
    {
      "epoch": 0.27045769764216365,
      "grad_norm": 0.5885879993438721,
      "learning_rate": 3.373702422145329e-05,
      "loss": 0.497,
      "mean_token_accuracy": 0.8356186389923096,
      "step": 195
    },
    {
      "epoch": 0.27739251040221913,
      "grad_norm": 0.3610420525074005,
      "learning_rate": 3.460207612456747e-05,
      "loss": 0.4989,
      "mean_token_accuracy": 0.8350513100624084,
      "step": 200
    },
    {
      "epoch": 0.2843273231622746,
      "grad_norm": 0.3954995572566986,
      "learning_rate": 3.546712802768166e-05,
      "loss": 0.5011,
      "mean_token_accuracy": 0.83415766954422,
      "step": 205
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 0.3071337342262268,
      "learning_rate": 3.633217993079585e-05,
      "loss": 0.5255,
      "mean_token_accuracy": 0.8285403490066529,
      "step": 210
    },
    {
      "epoch": 0.29819694868238555,
      "grad_norm": 0.31758391857147217,
      "learning_rate": 3.719723183391004e-05,
      "loss": 0.4954,
      "mean_token_accuracy": 0.835390031337738,
      "step": 215
    },
    {
      "epoch": 0.30513176144244103,
      "grad_norm": 0.3296087086200714,
      "learning_rate": 3.806228373702422e-05,
      "loss": 0.4923,
      "mean_token_accuracy": 0.8361375451087951,
      "step": 220
    },
    {
      "epoch": 0.3120665742024965,
      "grad_norm": 0.2980894446372986,
      "learning_rate": 3.892733564013841e-05,
      "loss": 0.4996,
      "mean_token_accuracy": 0.8341476917266846,
      "step": 225
    },
    {
      "epoch": 0.31900138696255204,
      "grad_norm": 0.2892495095729828,
      "learning_rate": 3.97923875432526e-05,
      "loss": 0.4855,
      "mean_token_accuracy": 0.8382086515426636,
      "step": 230
    },
    {
      "epoch": 0.3259361997226075,
      "grad_norm": 0.29287102818489075,
      "learning_rate": 4.065743944636679e-05,
      "loss": 0.4944,
      "mean_token_accuracy": 0.8353524923324585,
      "step": 235
    },
    {
      "epoch": 0.332871012482663,
      "grad_norm": 0.28245487809181213,
      "learning_rate": 4.1522491349480966e-05,
      "loss": 0.4887,
      "mean_token_accuracy": 0.8373544692993165,
      "step": 240
    },
    {
      "epoch": 0.33980582524271846,
      "grad_norm": 0.23551802337169647,
      "learning_rate": 4.238754325259516e-05,
      "loss": 0.4925,
      "mean_token_accuracy": 0.8361364006996155,
      "step": 245
    },
    {
      "epoch": 0.34674063800277394,
      "grad_norm": 0.24266427755355835,
      "learning_rate": 4.325259515570935e-05,
      "loss": 0.4759,
      "mean_token_accuracy": 0.8410738468170166,
      "step": 250
    },
    {
      "epoch": 0.3536754507628294,
      "grad_norm": 0.33316895365715027,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.4902,
      "mean_token_accuracy": 0.8370885252952576,
      "step": 255
    },
    {
      "epoch": 0.3606102635228849,
      "grad_norm": 0.5113539099693298,
      "learning_rate": 4.498269896193772e-05,
      "loss": 0.4918,
      "mean_token_accuracy": 0.8364068984985351,
      "step": 260
    },
    {
      "epoch": 0.36754507628294036,
      "grad_norm": 0.3733905851840973,
      "learning_rate": 4.58477508650519e-05,
      "loss": 0.49,
      "mean_token_accuracy": 0.8370036244392395,
      "step": 265
    },
    {
      "epoch": 0.37447988904299584,
      "grad_norm": 0.4112997353076935,
      "learning_rate": 4.671280276816609e-05,
      "loss": 0.4932,
      "mean_token_accuracy": 0.8356328129768371,
      "step": 270
    },
    {
      "epoch": 0.3814147018030513,
      "grad_norm": 0.5121487379074097,
      "learning_rate": 4.7577854671280283e-05,
      "loss": 0.479,
      "mean_token_accuracy": 0.839626955986023,
      "step": 275
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.36294957995414734,
      "learning_rate": 4.844290657439446e-05,
      "loss": 0.4829,
      "mean_token_accuracy": 0.8391167283058166,
      "step": 280
    },
    {
      "epoch": 0.39528432732316227,
      "grad_norm": 0.3162820339202881,
      "learning_rate": 4.930795847750865e-05,
      "loss": 0.4899,
      "mean_token_accuracy": 0.8368083834648132,
      "step": 285
    },
    {
      "epoch": 0.40221914008321774,
      "grad_norm": 0.3973437547683716,
      "learning_rate": 4.9980732177263974e-05,
      "loss": 0.4864,
      "mean_token_accuracy": 0.8374906539916992,
      "step": 290
    },
    {
      "epoch": 0.4091539528432732,
      "grad_norm": 0.5423433184623718,
      "learning_rate": 4.9884393063583816e-05,
      "loss": 0.4907,
      "mean_token_accuracy": 0.8373413920402527,
      "step": 295
    },
    {
      "epoch": 0.4160887656033287,
      "grad_norm": 0.39722123742103577,
      "learning_rate": 4.9788053949903666e-05,
      "loss": 0.4961,
      "mean_token_accuracy": 0.8351489901542664,
      "step": 300
    },
    {
      "epoch": 0.42302357836338417,
      "grad_norm": 0.34169071912765503,
      "learning_rate": 4.969171483622351e-05,
      "loss": 0.4891,
      "mean_token_accuracy": 0.8370493412017822,
      "step": 305
    },
    {
      "epoch": 0.42995839112343964,
      "grad_norm": 0.3429335951805115,
      "learning_rate": 4.959537572254335e-05,
      "loss": 0.4794,
      "mean_token_accuracy": 0.8396916627883911,
      "step": 310
    },
    {
      "epoch": 0.4368932038834951,
      "grad_norm": 0.3266272246837616,
      "learning_rate": 4.94990366088632e-05,
      "loss": 0.4757,
      "mean_token_accuracy": 0.8405494570732117,
      "step": 315
    },
    {
      "epoch": 0.44382801664355065,
      "grad_norm": 0.2874930202960968,
      "learning_rate": 4.940269749518305e-05,
      "loss": 0.4978,
      "mean_token_accuracy": 0.8344841122627258,
      "step": 320
    },
    {
      "epoch": 0.4507628294036061,
      "grad_norm": 0.2812349498271942,
      "learning_rate": 4.930635838150289e-05,
      "loss": 0.4839,
      "mean_token_accuracy": 0.8383953332901001,
      "step": 325
    },
    {
      "epoch": 0.4576976421636616,
      "grad_norm": 0.25296345353126526,
      "learning_rate": 4.921001926782274e-05,
      "loss": 0.4738,
      "mean_token_accuracy": 0.8412886261940002,
      "step": 330
    },
    {
      "epoch": 0.4646324549237171,
      "grad_norm": 0.22165291011333466,
      "learning_rate": 4.9113680154142584e-05,
      "loss": 0.4867,
      "mean_token_accuracy": 0.8379201173782349,
      "step": 335
    },
    {
      "epoch": 0.47156726768377255,
      "grad_norm": 0.2551758289337158,
      "learning_rate": 4.9017341040462426e-05,
      "loss": 0.4786,
      "mean_token_accuracy": 0.8399594306945801,
      "step": 340
    },
    {
      "epoch": 0.478502080443828,
      "grad_norm": 0.25708919763565063,
      "learning_rate": 4.8921001926782276e-05,
      "loss": 0.48,
      "mean_token_accuracy": 0.8395551085472107,
      "step": 345
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.1992408186197281,
      "learning_rate": 4.8824662813102125e-05,
      "loss": 0.4714,
      "mean_token_accuracy": 0.8418668508529663,
      "step": 350
    },
    {
      "epoch": 0.492371705963939,
      "grad_norm": 0.23445720970630646,
      "learning_rate": 4.872832369942197e-05,
      "loss": 0.471,
      "mean_token_accuracy": 0.8421580553054809,
      "step": 355
    },
    {
      "epoch": 0.49930651872399445,
      "grad_norm": 0.31462928652763367,
      "learning_rate": 4.863198458574181e-05,
      "loss": 0.4711,
      "mean_token_accuracy": 0.842027747631073,
      "step": 360
    },
    {
      "epoch": 0.5062413314840499,
      "grad_norm": 0.24767646193504333,
      "learning_rate": 4.853564547206166e-05,
      "loss": 0.4717,
      "mean_token_accuracy": 0.8417503118515015,
      "step": 365
    },
    {
      "epoch": 0.5131761442441054,
      "grad_norm": 0.2389938235282898,
      "learning_rate": 4.84393063583815e-05,
      "loss": 0.4677,
      "mean_token_accuracy": 0.8431912064552307,
      "step": 370
    },
    {
      "epoch": 0.5201109570041609,
      "grad_norm": 0.29998722672462463,
      "learning_rate": 4.834296724470135e-05,
      "loss": 0.4877,
      "mean_token_accuracy": 0.8374402284622192,
      "step": 375
    },
    {
      "epoch": 0.5270457697642164,
      "grad_norm": 0.2877121865749359,
      "learning_rate": 4.82466281310212e-05,
      "loss": 0.4863,
      "mean_token_accuracy": 0.8380719065666199,
      "step": 380
    },
    {
      "epoch": 0.5339805825242718,
      "grad_norm": 0.24628062546253204,
      "learning_rate": 4.815028901734104e-05,
      "loss": 0.4665,
      "mean_token_accuracy": 0.8434135437011718,
      "step": 385
    },
    {
      "epoch": 0.5409153952843273,
      "grad_norm": 0.24347947537899017,
      "learning_rate": 4.8053949903660886e-05,
      "loss": 0.476,
      "mean_token_accuracy": 0.8404138565063477,
      "step": 390
    },
    {
      "epoch": 0.5478502080443828,
      "grad_norm": 0.20724909007549286,
      "learning_rate": 4.7957610789980735e-05,
      "loss": 0.4881,
      "mean_token_accuracy": 0.8372583389282227,
      "step": 395
    },
    {
      "epoch": 0.5547850208044383,
      "grad_norm": 0.2162594497203827,
      "learning_rate": 4.786127167630058e-05,
      "loss": 0.4726,
      "mean_token_accuracy": 0.842011570930481,
      "step": 400
    },
    {
      "epoch": 0.5617198335644937,
      "grad_norm": 0.34494099020957947,
      "learning_rate": 4.776493256262042e-05,
      "loss": 0.4783,
      "mean_token_accuracy": 0.8399308085441589,
      "step": 405
    },
    {
      "epoch": 0.5686546463245492,
      "grad_norm": 0.24402566254138947,
      "learning_rate": 4.7668593448940276e-05,
      "loss": 0.4953,
      "mean_token_accuracy": 0.8352864623069763,
      "step": 410
    },
    {
      "epoch": 0.5755894590846047,
      "grad_norm": 0.2124612033367157,
      "learning_rate": 4.757225433526012e-05,
      "loss": 0.4849,
      "mean_token_accuracy": 0.8380748987197876,
      "step": 415
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.20577934384346008,
      "learning_rate": 4.747591522157996e-05,
      "loss": 0.4591,
      "mean_token_accuracy": 0.845665693283081,
      "step": 420
    },
    {
      "epoch": 0.5894590846047156,
      "grad_norm": 0.2838655710220337,
      "learning_rate": 4.737957610789981e-05,
      "loss": 0.4709,
      "mean_token_accuracy": 0.8418583750724793,
      "step": 425
    },
    {
      "epoch": 0.5963938973647711,
      "grad_norm": 0.2222902923822403,
      "learning_rate": 4.7283236994219653e-05,
      "loss": 0.4817,
      "mean_token_accuracy": 0.8388337612152099,
      "step": 430
    },
    {
      "epoch": 0.6033287101248266,
      "grad_norm": 0.25565460324287415,
      "learning_rate": 4.7186897880539496e-05,
      "loss": 0.4724,
      "mean_token_accuracy": 0.8415215969085693,
      "step": 435
    },
    {
      "epoch": 0.6102635228848821,
      "grad_norm": 0.680081844329834,
      "learning_rate": 4.709055876685935e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.8402902245521545,
      "step": 440
    },
    {
      "epoch": 0.6171983356449375,
      "grad_norm": 0.3035682141780853,
      "learning_rate": 4.6994219653179195e-05,
      "loss": 0.4749,
      "mean_token_accuracy": 0.8405117988586426,
      "step": 445
    },
    {
      "epoch": 0.624133148404993,
      "grad_norm": 0.22393807768821716,
      "learning_rate": 4.689788053949904e-05,
      "loss": 0.4735,
      "mean_token_accuracy": 0.8410566568374633,
      "step": 450
    },
    {
      "epoch": 0.6310679611650486,
      "grad_norm": 0.23452860116958618,
      "learning_rate": 4.6801541425818887e-05,
      "loss": 0.4798,
      "mean_token_accuracy": 0.8394344925880433,
      "step": 455
    },
    {
      "epoch": 0.6380027739251041,
      "grad_norm": 0.21135355532169342,
      "learning_rate": 4.670520231213873e-05,
      "loss": 0.4783,
      "mean_token_accuracy": 0.8398800015449523,
      "step": 460
    },
    {
      "epoch": 0.6449375866851595,
      "grad_norm": 0.2495516985654831,
      "learning_rate": 4.660886319845857e-05,
      "loss": 0.4769,
      "mean_token_accuracy": 0.8407980084419251,
      "step": 465
    },
    {
      "epoch": 0.651872399445215,
      "grad_norm": 0.25724372267723083,
      "learning_rate": 4.651252408477843e-05,
      "loss": 0.4764,
      "mean_token_accuracy": 0.8402070879936219,
      "step": 470
    },
    {
      "epoch": 0.6588072122052705,
      "grad_norm": 0.28974995017051697,
      "learning_rate": 4.641618497109827e-05,
      "loss": 0.468,
      "mean_token_accuracy": 0.8425545215606689,
      "step": 475
    },
    {
      "epoch": 0.665742024965326,
      "grad_norm": 0.26298555731773376,
      "learning_rate": 4.631984585741811e-05,
      "loss": 0.4752,
      "mean_token_accuracy": 0.8405273199081421,
      "step": 480
    },
    {
      "epoch": 0.6726768377253814,
      "grad_norm": 0.3188522756099701,
      "learning_rate": 4.622350674373796e-05,
      "loss": 0.4683,
      "mean_token_accuracy": 0.8426392555236817,
      "step": 485
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.2528276741504669,
      "learning_rate": 4.6127167630057805e-05,
      "loss": 0.4753,
      "mean_token_accuracy": 0.840662169456482,
      "step": 490
    },
    {
      "epoch": 0.6865464632454924,
      "grad_norm": 0.3695737421512604,
      "learning_rate": 4.603082851637765e-05,
      "loss": 0.501,
      "mean_token_accuracy": 0.8371694445610046,
      "step": 495
    },
    {
      "epoch": 0.6934812760055479,
      "grad_norm": 0.31206727027893066,
      "learning_rate": 4.59344894026975e-05,
      "loss": 0.478,
      "mean_token_accuracy": 0.8401562452316285,
      "step": 500
    },
    {
      "epoch": 0.7004160887656034,
      "grad_norm": 3.478522539138794,
      "learning_rate": 4.5838150289017346e-05,
      "loss": 0.49,
      "mean_token_accuracy": 0.8365014433860779,
      "step": 505
    },
    {
      "epoch": 0.7073509015256588,
      "grad_norm": 0.4430016875267029,
      "learning_rate": 4.574181117533719e-05,
      "loss": 0.47,
      "mean_token_accuracy": 0.8422938823699951,
      "step": 510
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.28713470697402954,
      "learning_rate": 4.564547206165704e-05,
      "loss": 0.4786,
      "mean_token_accuracy": 0.8401166200637817,
      "step": 515
    },
    {
      "epoch": 0.7212205270457698,
      "grad_norm": 0.2158370316028595,
      "learning_rate": 4.554913294797688e-05,
      "loss": 0.4703,
      "mean_token_accuracy": 0.8421276092529297,
      "step": 520
    },
    {
      "epoch": 0.7281553398058253,
      "grad_norm": 0.2426484376192093,
      "learning_rate": 4.545279383429672e-05,
      "loss": 0.469,
      "mean_token_accuracy": 0.8426563143730164,
      "step": 525
    },
    {
      "epoch": 0.7350901525658807,
      "grad_norm": 0.27153995633125305,
      "learning_rate": 4.535645472061657e-05,
      "loss": 0.4754,
      "mean_token_accuracy": 0.8406094431877136,
      "step": 530
    },
    {
      "epoch": 0.7420249653259362,
      "grad_norm": 0.1991535872220993,
      "learning_rate": 4.526011560693642e-05,
      "loss": 0.4782,
      "mean_token_accuracy": 0.8397158980369568,
      "step": 535
    },
    {
      "epoch": 0.7489597780859917,
      "grad_norm": 0.15923242270946503,
      "learning_rate": 4.5163776493256264e-05,
      "loss": 0.4563,
      "mean_token_accuracy": 0.8461790800094604,
      "step": 540
    },
    {
      "epoch": 0.7558945908460472,
      "grad_norm": 0.18306083977222443,
      "learning_rate": 4.5067437379576114e-05,
      "loss": 0.4791,
      "mean_token_accuracy": 0.8393635034561158,
      "step": 545
    },
    {
      "epoch": 0.7628294036061026,
      "grad_norm": 0.24309256672859192,
      "learning_rate": 4.4971098265895956e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.8401144862174987,
      "step": 550
    },
    {
      "epoch": 0.7697642163661581,
      "grad_norm": 0.20910784602165222,
      "learning_rate": 4.48747591522158e-05,
      "loss": 0.4728,
      "mean_token_accuracy": 0.8417426466941833,
      "step": 555
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.1896984726190567,
      "learning_rate": 4.477842003853565e-05,
      "loss": 0.4557,
      "mean_token_accuracy": 0.8461586833000183,
      "step": 560
    },
    {
      "epoch": 0.7836338418862691,
      "grad_norm": 0.18798613548278809,
      "learning_rate": 4.46820809248555e-05,
      "loss": 0.457,
      "mean_token_accuracy": 0.8459754705429077,
      "step": 565
    },
    {
      "epoch": 0.7905686546463245,
      "grad_norm": 0.18959036469459534,
      "learning_rate": 4.458574181117534e-05,
      "loss": 0.4633,
      "mean_token_accuracy": 0.8437102913856507,
      "step": 570
    },
    {
      "epoch": 0.79750346740638,
      "grad_norm": 0.16292130947113037,
      "learning_rate": 4.448940269749519e-05,
      "loss": 0.4749,
      "mean_token_accuracy": 0.8404599308967591,
      "step": 575
    },
    {
      "epoch": 0.8044382801664355,
      "grad_norm": 0.17686040699481964,
      "learning_rate": 4.439306358381503e-05,
      "loss": 0.4601,
      "mean_token_accuracy": 0.844899308681488,
      "step": 580
    },
    {
      "epoch": 0.811373092926491,
      "grad_norm": 0.1865614652633667,
      "learning_rate": 4.4296724470134875e-05,
      "loss": 0.4533,
      "mean_token_accuracy": 0.846677553653717,
      "step": 585
    },
    {
      "epoch": 0.8183079056865464,
      "grad_norm": 0.2037810981273651,
      "learning_rate": 4.4200385356454724e-05,
      "loss": 0.4575,
      "mean_token_accuracy": 0.8457266449928283,
      "step": 590
    },
    {
      "epoch": 0.8252427184466019,
      "grad_norm": 0.16701985895633698,
      "learning_rate": 4.4104046242774566e-05,
      "loss": 0.466,
      "mean_token_accuracy": 0.8428797006607056,
      "step": 595
    },
    {
      "epoch": 0.8321775312066574,
      "grad_norm": 0.19714096188545227,
      "learning_rate": 4.4007707129094416e-05,
      "loss": 0.4696,
      "mean_token_accuracy": 0.8422728657722474,
      "step": 600
    },
    {
      "epoch": 0.8391123439667129,
      "grad_norm": 0.20772860944271088,
      "learning_rate": 4.391136801541426e-05,
      "loss": 0.4635,
      "mean_token_accuracy": 0.8438523054122925,
      "step": 605
    },
    {
      "epoch": 0.8460471567267683,
      "grad_norm": 0.35546374320983887,
      "learning_rate": 4.381502890173411e-05,
      "loss": 0.4665,
      "mean_token_accuracy": 0.8430918097496033,
      "step": 610
    },
    {
      "epoch": 0.8529819694868238,
      "grad_norm": 0.19986563920974731,
      "learning_rate": 4.371868978805395e-05,
      "loss": 0.4742,
      "mean_token_accuracy": 0.8409379243850708,
      "step": 615
    },
    {
      "epoch": 0.8599167822468793,
      "grad_norm": 0.4013294279575348,
      "learning_rate": 4.36223506743738e-05,
      "loss": 0.4673,
      "mean_token_accuracy": 0.8426662087440491,
      "step": 620
    },
    {
      "epoch": 0.8668515950069348,
      "grad_norm": 0.29566317796707153,
      "learning_rate": 4.352601156069364e-05,
      "loss": 0.4837,
      "mean_token_accuracy": 0.8380556702613831,
      "step": 625
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.24461045861244202,
      "learning_rate": 4.342967244701349e-05,
      "loss": 0.4648,
      "mean_token_accuracy": 0.8434231281280518,
      "step": 630
    },
    {
      "epoch": 0.8807212205270458,
      "grad_norm": 0.2197730541229248,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.4585,
      "mean_token_accuracy": 0.8448979973793029,
      "step": 635
    },
    {
      "epoch": 0.8876560332871013,
      "grad_norm": 0.22158759832382202,
      "learning_rate": 4.3236994219653183e-05,
      "loss": 0.4678,
      "mean_token_accuracy": 0.8427410125732422,
      "step": 640
    },
    {
      "epoch": 0.8945908460471568,
      "grad_norm": 0.17014814913272858,
      "learning_rate": 4.3140655105973026e-05,
      "loss": 0.4706,
      "mean_token_accuracy": 0.8416074395179749,
      "step": 645
    },
    {
      "epoch": 0.9015256588072122,
      "grad_norm": 0.22929687798023224,
      "learning_rate": 4.304431599229287e-05,
      "loss": 0.4753,
      "mean_token_accuracy": 0.8403880834579468,
      "step": 650
    },
    {
      "epoch": 0.9084604715672677,
      "grad_norm": 0.20894835889339447,
      "learning_rate": 4.294797687861272e-05,
      "loss": 0.4734,
      "mean_token_accuracy": 0.8410162568092346,
      "step": 655
    },
    {
      "epoch": 0.9153952843273232,
      "grad_norm": 0.18031327426433563,
      "learning_rate": 4.285163776493257e-05,
      "loss": 0.4536,
      "mean_token_accuracy": 0.8469532251358032,
      "step": 660
    },
    {
      "epoch": 0.9223300970873787,
      "grad_norm": 0.17288991808891296,
      "learning_rate": 4.275529865125241e-05,
      "loss": 0.4611,
      "mean_token_accuracy": 0.8443895936012268,
      "step": 665
    },
    {
      "epoch": 0.9292649098474342,
      "grad_norm": 0.1980760544538498,
      "learning_rate": 4.265895953757226e-05,
      "loss": 0.484,
      "mean_token_accuracy": 0.8379009962081909,
      "step": 670
    },
    {
      "epoch": 0.9361997226074896,
      "grad_norm": 0.20848602056503296,
      "learning_rate": 4.25626204238921e-05,
      "loss": 0.4771,
      "mean_token_accuracy": 0.8398370265960693,
      "step": 675
    },
    {
      "epoch": 0.9431345353675451,
      "grad_norm": 0.1636408418416977,
      "learning_rate": 4.2466281310211944e-05,
      "loss": 0.4578,
      "mean_token_accuracy": 0.845670223236084,
      "step": 680
    },
    {
      "epoch": 0.9500693481276006,
      "grad_norm": 0.22376923263072968,
      "learning_rate": 4.2369942196531794e-05,
      "loss": 0.4652,
      "mean_token_accuracy": 0.8431706428527832,
      "step": 685
    },
    {
      "epoch": 0.957004160887656,
      "grad_norm": 0.21399416029453278,
      "learning_rate": 4.2273603082851636e-05,
      "loss": 0.4537,
      "mean_token_accuracy": 0.8464810252189636,
      "step": 690
    },
    {
      "epoch": 0.9639389736477115,
      "grad_norm": 2.5790159702301025,
      "learning_rate": 4.2177263969171485e-05,
      "loss": 0.4754,
      "mean_token_accuracy": 0.8421392440795898,
      "step": 695
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.2648729085922241,
      "learning_rate": 4.2080924855491335e-05,
      "loss": 0.469,
      "mean_token_accuracy": 0.8423485517501831,
      "step": 700
    },
    {
      "epoch": 0.9778085991678225,
      "grad_norm": 0.20691435039043427,
      "learning_rate": 4.198458574181118e-05,
      "loss": 0.4534,
      "mean_token_accuracy": 0.8466127276420593,
      "step": 705
    },
    {
      "epoch": 0.984743411927878,
      "grad_norm": 0.2122969925403595,
      "learning_rate": 4.188824662813102e-05,
      "loss": 0.4744,
      "mean_token_accuracy": 0.843373692035675,
      "step": 710
    },
    {
      "epoch": 0.9916782246879334,
      "grad_norm": 0.18356889486312866,
      "learning_rate": 4.179190751445087e-05,
      "loss": 0.4735,
      "mean_token_accuracy": 0.840711236000061,
      "step": 715
    },
    {
      "epoch": 0.9986130374479889,
      "grad_norm": 0.2710322141647339,
      "learning_rate": 4.169556840077071e-05,
      "loss": 0.4893,
      "mean_token_accuracy": 0.8400939464569092,
      "step": 720
    },
    {
      "epoch": 1.0055478502080444,
      "grad_norm": 0.28685542941093445,
      "learning_rate": 4.159922928709056e-05,
      "loss": 0.4413,
      "mean_token_accuracy": 0.8504527807235718,
      "step": 725
    },
    {
      "epoch": 1.0124826629680999,
      "grad_norm": 0.24674533307552338,
      "learning_rate": 4.150289017341041e-05,
      "loss": 0.4098,
      "mean_token_accuracy": 0.8587909460067749,
      "step": 730
    },
    {
      "epoch": 1.0194174757281553,
      "grad_norm": 0.21753250062465668,
      "learning_rate": 4.140655105973025e-05,
      "loss": 0.3935,
      "mean_token_accuracy": 0.8634741544723511,
      "step": 735
    },
    {
      "epoch": 1.0263522884882108,
      "grad_norm": 0.20492789149284363,
      "learning_rate": 4.1310211946050096e-05,
      "loss": 0.4068,
      "mean_token_accuracy": 0.8603113770484925,
      "step": 740
    },
    {
      "epoch": 1.0332871012482663,
      "grad_norm": 0.16923396289348602,
      "learning_rate": 4.1213872832369945e-05,
      "loss": 0.4185,
      "mean_token_accuracy": 0.8562816023826599,
      "step": 745
    },
    {
      "epoch": 1.0402219140083218,
      "grad_norm": 0.18504321575164795,
      "learning_rate": 4.111753371868979e-05,
      "loss": 0.4083,
      "mean_token_accuracy": 0.8588919401168823,
      "step": 750
    },
    {
      "epoch": 1.0471567267683772,
      "grad_norm": 0.15754340589046478,
      "learning_rate": 4.102119460500964e-05,
      "loss": 0.4001,
      "mean_token_accuracy": 0.8617303729057312,
      "step": 755
    },
    {
      "epoch": 1.0540915395284327,
      "grad_norm": 0.16705656051635742,
      "learning_rate": 4.0924855491329486e-05,
      "loss": 0.4124,
      "mean_token_accuracy": 0.8577338337898255,
      "step": 760
    },
    {
      "epoch": 1.0610263522884882,
      "grad_norm": 0.1913621723651886,
      "learning_rate": 4.082851637764933e-05,
      "loss": 0.4027,
      "mean_token_accuracy": 0.860604989528656,
      "step": 765
    },
    {
      "epoch": 1.0679611650485437,
      "grad_norm": 0.1807246059179306,
      "learning_rate": 4.073217726396917e-05,
      "loss": 0.4154,
      "mean_token_accuracy": 0.8569133520126343,
      "step": 770
    },
    {
      "epoch": 1.0748959778085991,
      "grad_norm": 0.16904115676879883,
      "learning_rate": 4.063583815028902e-05,
      "loss": 0.4043,
      "mean_token_accuracy": 0.8603330969810485,
      "step": 775
    },
    {
      "epoch": 1.0818307905686546,
      "grad_norm": 0.13820037245750427,
      "learning_rate": 4.053949903660886e-05,
      "loss": 0.4032,
      "mean_token_accuracy": 0.8607820630073547,
      "step": 780
    },
    {
      "epoch": 1.08876560332871,
      "grad_norm": 0.15458045899868011,
      "learning_rate": 4.0443159922928706e-05,
      "loss": 0.4126,
      "mean_token_accuracy": 0.8577965140342713,
      "step": 785
    },
    {
      "epoch": 1.0957004160887656,
      "grad_norm": 0.14621621370315552,
      "learning_rate": 4.034682080924856e-05,
      "loss": 0.4164,
      "mean_token_accuracy": 0.8568703651428222,
      "step": 790
    },
    {
      "epoch": 1.102635228848821,
      "grad_norm": 0.22418245673179626,
      "learning_rate": 4.0250481695568404e-05,
      "loss": 0.4151,
      "mean_token_accuracy": 0.8572113513946533,
      "step": 795
    },
    {
      "epoch": 1.1095700416088765,
      "grad_norm": 0.18166805803775787,
      "learning_rate": 4.015414258188825e-05,
      "loss": 0.4236,
      "mean_token_accuracy": 0.8545473575592041,
      "step": 800
    },
    {
      "epoch": 1.116504854368932,
      "grad_norm": 0.19410911202430725,
      "learning_rate": 4.0057803468208096e-05,
      "loss": 0.4081,
      "mean_token_accuracy": 0.8594057202339173,
      "step": 805
    },
    {
      "epoch": 1.1234396671289875,
      "grad_norm": 0.15663549304008484,
      "learning_rate": 3.996146435452794e-05,
      "loss": 0.41,
      "mean_token_accuracy": 0.8585174441337585,
      "step": 810
    },
    {
      "epoch": 1.130374479889043,
      "grad_norm": 0.2926901578903198,
      "learning_rate": 3.986512524084778e-05,
      "loss": 0.4088,
      "mean_token_accuracy": 0.8590117692947388,
      "step": 815
    },
    {
      "epoch": 1.1373092926490984,
      "grad_norm": 0.14440152049064636,
      "learning_rate": 3.976878612716764e-05,
      "loss": 0.4029,
      "mean_token_accuracy": 0.8609296917915344,
      "step": 820
    },
    {
      "epoch": 1.1442441054091539,
      "grad_norm": 0.18435537815093994,
      "learning_rate": 3.967244701348748e-05,
      "loss": 0.406,
      "mean_token_accuracy": 0.8599510669708252,
      "step": 825
    },
    {
      "epoch": 1.1511789181692094,
      "grad_norm": 0.16614344716072083,
      "learning_rate": 3.957610789980732e-05,
      "loss": 0.4165,
      "mean_token_accuracy": 0.8565038800239563,
      "step": 830
    },
    {
      "epoch": 1.1581137309292648,
      "grad_norm": 0.180514857172966,
      "learning_rate": 3.947976878612717e-05,
      "loss": 0.4233,
      "mean_token_accuracy": 0.854881489276886,
      "step": 835
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 0.17873796820640564,
      "learning_rate": 3.9383429672447015e-05,
      "loss": 0.4142,
      "mean_token_accuracy": 0.8575613379478455,
      "step": 840
    },
    {
      "epoch": 1.1719833564493758,
      "grad_norm": 0.17171607911586761,
      "learning_rate": 3.928709055876686e-05,
      "loss": 0.4199,
      "mean_token_accuracy": 0.8555831432342529,
      "step": 845
    },
    {
      "epoch": 1.1789181692094313,
      "grad_norm": 0.2052180916070938,
      "learning_rate": 3.9190751445086707e-05,
      "loss": 0.4084,
      "mean_token_accuracy": 0.8590785980224609,
      "step": 850
    },
    {
      "epoch": 1.1858529819694867,
      "grad_norm": 0.16104774177074432,
      "learning_rate": 3.9094412331406556e-05,
      "loss": 0.4064,
      "mean_token_accuracy": 0.8598842978477478,
      "step": 855
    },
    {
      "epoch": 1.1927877947295422,
      "grad_norm": 0.16743043065071106,
      "learning_rate": 3.89980732177264e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.857661247253418,
      "step": 860
    },
    {
      "epoch": 1.1997226074895977,
      "grad_norm": 0.15085460245609283,
      "learning_rate": 3.890173410404625e-05,
      "loss": 0.4143,
      "mean_token_accuracy": 0.8573906064033509,
      "step": 865
    },
    {
      "epoch": 1.2066574202496532,
      "grad_norm": 0.14390355348587036,
      "learning_rate": 3.880539499036609e-05,
      "loss": 0.4221,
      "mean_token_accuracy": 0.8549758553504944,
      "step": 870
    },
    {
      "epoch": 1.2135922330097086,
      "grad_norm": 0.170955091714859,
      "learning_rate": 3.870905587668593e-05,
      "loss": 0.417,
      "mean_token_accuracy": 0.8564952373504638,
      "step": 875
    },
    {
      "epoch": 1.2205270457697641,
      "grad_norm": 0.17432747781276703,
      "learning_rate": 3.861271676300578e-05,
      "loss": 0.4065,
      "mean_token_accuracy": 0.8594078302383423,
      "step": 880
    },
    {
      "epoch": 1.2274618585298196,
      "grad_norm": 0.15886807441711426,
      "learning_rate": 3.851637764932563e-05,
      "loss": 0.4191,
      "mean_token_accuracy": 0.8560209155082703,
      "step": 885
    },
    {
      "epoch": 1.234396671289875,
      "grad_norm": 0.22739961743354797,
      "learning_rate": 3.8420038535645474e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.8580429792404175,
      "step": 890
    },
    {
      "epoch": 1.2413314840499305,
      "grad_norm": 0.1761876940727234,
      "learning_rate": 3.832369942196532e-05,
      "loss": 0.4058,
      "mean_token_accuracy": 0.8598258137702942,
      "step": 895
    },
    {
      "epoch": 1.248266296809986,
      "grad_norm": 0.8043875098228455,
      "learning_rate": 3.8227360308285166e-05,
      "loss": 0.4146,
      "mean_token_accuracy": 0.8571544885635376,
      "step": 900
    },
    {
      "epoch": 1.2552011095700415,
      "grad_norm": 0.18043817579746246,
      "learning_rate": 3.813102119460501e-05,
      "loss": 0.4032,
      "mean_token_accuracy": 0.8605299115180969,
      "step": 905
    },
    {
      "epoch": 1.262135922330097,
      "grad_norm": 0.16484476625919342,
      "learning_rate": 3.803468208092486e-05,
      "loss": 0.3987,
      "mean_token_accuracy": 0.8620869636535644,
      "step": 910
    },
    {
      "epoch": 1.2690707350901524,
      "grad_norm": 0.15530748665332794,
      "learning_rate": 3.793834296724471e-05,
      "loss": 0.4019,
      "mean_token_accuracy": 0.8608452916145325,
      "step": 915
    },
    {
      "epoch": 1.276005547850208,
      "grad_norm": 0.16284696757793427,
      "learning_rate": 3.784200385356455e-05,
      "loss": 0.4056,
      "mean_token_accuracy": 0.8598546504974365,
      "step": 920
    },
    {
      "epoch": 1.2829403606102634,
      "grad_norm": 0.15156075358390808,
      "learning_rate": 3.774566473988439e-05,
      "loss": 0.4148,
      "mean_token_accuracy": 0.8573850750923157,
      "step": 925
    },
    {
      "epoch": 1.2898751733703189,
      "grad_norm": 0.18044961988925934,
      "learning_rate": 3.764932562620424e-05,
      "loss": 0.4165,
      "mean_token_accuracy": 0.8561815977096557,
      "step": 930
    },
    {
      "epoch": 1.2968099861303743,
      "grad_norm": 0.1658436506986618,
      "learning_rate": 3.7552986512524084e-05,
      "loss": 0.4056,
      "mean_token_accuracy": 0.8600709080696106,
      "step": 935
    },
    {
      "epoch": 1.3037447988904298,
      "grad_norm": 0.16520382463932037,
      "learning_rate": 3.7456647398843934e-05,
      "loss": 0.4148,
      "mean_token_accuracy": 0.8568984508514405,
      "step": 940
    },
    {
      "epoch": 1.3106796116504853,
      "grad_norm": 0.1799880713224411,
      "learning_rate": 3.736030828516378e-05,
      "loss": 0.4188,
      "mean_token_accuracy": 0.856162166595459,
      "step": 945
    },
    {
      "epoch": 1.317614424410541,
      "grad_norm": 0.16812920570373535,
      "learning_rate": 3.7263969171483626e-05,
      "loss": 0.4111,
      "mean_token_accuracy": 0.8584610104560852,
      "step": 950
    },
    {
      "epoch": 1.3245492371705965,
      "grad_norm": 0.15165302157402039,
      "learning_rate": 3.716763005780347e-05,
      "loss": 0.404,
      "mean_token_accuracy": 0.860293960571289,
      "step": 955
    },
    {
      "epoch": 1.331484049930652,
      "grad_norm": 0.13624367117881775,
      "learning_rate": 3.707129094412332e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.8572997689247132,
      "step": 960
    },
    {
      "epoch": 1.3384188626907074,
      "grad_norm": 0.6036350131034851,
      "learning_rate": 3.697495183044316e-05,
      "loss": 0.4139,
      "mean_token_accuracy": 0.8579724669456482,
      "step": 965
    },
    {
      "epoch": 1.345353675450763,
      "grad_norm": 0.16172119975090027,
      "learning_rate": 3.6878612716763e-05,
      "loss": 0.4194,
      "mean_token_accuracy": 0.8556413412094116,
      "step": 970
    },
    {
      "epoch": 1.3522884882108184,
      "grad_norm": 0.13519282639026642,
      "learning_rate": 3.678227360308285e-05,
      "loss": 0.4053,
      "mean_token_accuracy": 0.8597145199775695,
      "step": 975
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.14305779337882996,
      "learning_rate": 3.66859344894027e-05,
      "loss": 0.4066,
      "mean_token_accuracy": 0.8596665501594544,
      "step": 980
    },
    {
      "epoch": 1.3661581137309293,
      "grad_norm": 0.18043436110019684,
      "learning_rate": 3.6589595375722544e-05,
      "loss": 0.4061,
      "mean_token_accuracy": 0.8598026871681214,
      "step": 985
    },
    {
      "epoch": 1.3730929264909848,
      "grad_norm": 0.12696458399295807,
      "learning_rate": 3.649325626204239e-05,
      "loss": 0.4035,
      "mean_token_accuracy": 0.860295832157135,
      "step": 990
    },
    {
      "epoch": 1.3800277392510403,
      "grad_norm": 0.15299014747142792,
      "learning_rate": 3.6396917148362236e-05,
      "loss": 0.4045,
      "mean_token_accuracy": 0.8601055145263672,
      "step": 995
    },
    {
      "epoch": 1.3869625520110958,
      "grad_norm": 0.14797343313694,
      "learning_rate": 3.630057803468208e-05,
      "loss": 0.4064,
      "mean_token_accuracy": 0.8599012613296508,
      "step": 1000
    }
  ],
  "logging_steps": 5,
  "max_steps": 2884,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2953357891076096e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}