{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1953, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 8.421893119812012, "eval_runtime": 290.5835, "eval_samples_per_second": 217.989, "eval_steps_per_second": 1.948, "step": 0 }, { "epoch": 0.0005120327700972862, "grad_norm": 29.11949348449707, "learning_rate": 0.0, "loss": 5.5689, "step": 1 }, { "epoch": 0.0010240655401945725, "grad_norm": 45.6514778137207, "learning_rate": 1.7094017094017095e-07, "loss": 5.7038, "step": 2 }, { "epoch": 0.0015360983102918587, "grad_norm": 38.365718841552734, "learning_rate": 3.418803418803419e-07, "loss": 5.8605, "step": 3 }, { "epoch": 0.002048131080389145, "grad_norm": 24.351367950439453, "learning_rate": 5.128205128205128e-07, "loss": 5.0712, "step": 4 }, { "epoch": 0.002560163850486431, "grad_norm": 44.47206497192383, "learning_rate": 6.837606837606838e-07, "loss": 5.4321, "step": 5 }, { "epoch": 0.0030721966205837174, "grad_norm": 28.493654251098633, "learning_rate": 8.547008547008548e-07, "loss": 5.5277, "step": 6 }, { "epoch": 0.0035842293906810036, "grad_norm": 35.8974609375, "learning_rate": 1.0256410256410257e-06, "loss": 5.85, "step": 7 }, { "epoch": 0.00409626216077829, "grad_norm": 28.743633270263672, "learning_rate": 1.1965811965811968e-06, "loss": 5.6127, "step": 8 }, { "epoch": 0.004608294930875576, "grad_norm": 30.60209083557129, "learning_rate": 1.3675213675213676e-06, "loss": 5.3472, "step": 9 }, { "epoch": 0.005120327700972862, "grad_norm": 52.63827896118164, "learning_rate": 1.5384615384615387e-06, "loss": 5.3988, "step": 10 }, { "epoch": 0.0056323604710701485, "grad_norm": 23.665441513061523, "learning_rate": 1.7094017094017097e-06, "loss": 5.211, "step": 11 }, { "epoch": 0.006144393241167435, "grad_norm": 24.975431442260742, "learning_rate": 1.8803418803418804e-06, "loss": 5.186, "step": 12 }, { "epoch": 0.006656426011264721, "grad_norm": 15.487852096557617, "learning_rate": 2.0512820512820513e-06, "loss": 4.6811, "step": 13 }, { "epoch": 0.007168458781362007, "grad_norm": 13.852325439453125, "learning_rate": 2.2222222222222225e-06, "loss": 4.7137, "step": 14 }, { "epoch": 0.007680491551459293, "grad_norm": 13.090566635131836, "learning_rate": 2.3931623931623937e-06, "loss": 4.7662, "step": 15 }, { "epoch": 0.00819252432155658, "grad_norm": 9.755819320678711, "learning_rate": 2.564102564102564e-06, "loss": 4.4373, "step": 16 }, { "epoch": 0.008704557091653867, "grad_norm": 6.100168228149414, "learning_rate": 2.735042735042735e-06, "loss": 4.1901, "step": 17 }, { "epoch": 0.009216589861751152, "grad_norm": 5.699375629425049, "learning_rate": 2.9059829059829063e-06, "loss": 4.3055, "step": 18 }, { "epoch": 0.00972862263184844, "grad_norm": 3.9487569332122803, "learning_rate": 3.0769230769230774e-06, "loss": 4.0598, "step": 19 }, { "epoch": 0.010240655401945725, "grad_norm": 3.4959120750427246, "learning_rate": 3.247863247863248e-06, "loss": 4.1284, "step": 20 }, { "epoch": 0.010752688172043012, "grad_norm": 3.13635516166687, "learning_rate": 3.4188034188034193e-06, "loss": 4.1187, "step": 21 }, { "epoch": 0.011264720942140297, "grad_norm": 2.6086392402648926, "learning_rate": 3.5897435897435896e-06, "loss": 4.0442, "step": 22 }, { "epoch": 0.011776753712237584, "grad_norm": 2.4481008052825928, "learning_rate": 3.760683760683761e-06, "loss": 3.9187, "step": 23 }, { "epoch": 0.01228878648233487, "grad_norm": 2.2170064449310303, "learning_rate": 3.9316239316239315e-06, "loss": 3.876, "step": 24 }, { "epoch": 0.012800819252432157, "grad_norm": 2.1951825618743896, "learning_rate": 4.102564102564103e-06, "loss": 3.7054, "step": 25 }, { "epoch": 0.013312852022529442, "grad_norm": 2.148066520690918, "learning_rate": 4.273504273504274e-06, "loss": 3.9288, "step": 26 }, { "epoch": 0.013824884792626729, "grad_norm": 1.9754362106323242, "learning_rate": 4.444444444444445e-06, "loss": 3.7277, "step": 27 }, { "epoch": 0.014336917562724014, "grad_norm": 1.882916808128357, "learning_rate": 4.615384615384616e-06, "loss": 3.7497, "step": 28 }, { "epoch": 0.014848950332821301, "grad_norm": 1.7356491088867188, "learning_rate": 4.786324786324787e-06, "loss": 3.6844, "step": 29 }, { "epoch": 0.015360983102918587, "grad_norm": 1.6454310417175293, "learning_rate": 4.957264957264958e-06, "loss": 3.7156, "step": 30 }, { "epoch": 0.015873015873015872, "grad_norm": 1.4495666027069092, "learning_rate": 5.128205128205128e-06, "loss": 3.4571, "step": 31 }, { "epoch": 0.01638504864311316, "grad_norm": 1.3990592956542969, "learning_rate": 5.299145299145299e-06, "loss": 3.4633, "step": 32 }, { "epoch": 0.016897081413210446, "grad_norm": 1.3835159540176392, "learning_rate": 5.47008547008547e-06, "loss": 3.4069, "step": 33 }, { "epoch": 0.017409114183307733, "grad_norm": 1.1848242282867432, "learning_rate": 5.641025641025641e-06, "loss": 3.4285, "step": 34 }, { "epoch": 0.017921146953405017, "grad_norm": 1.1986013650894165, "learning_rate": 5.8119658119658126e-06, "loss": 3.3033, "step": 35 }, { "epoch": 0.018433179723502304, "grad_norm": 1.1884499788284302, "learning_rate": 5.982905982905984e-06, "loss": 3.4241, "step": 36 }, { "epoch": 0.01894521249359959, "grad_norm": 1.1940100193023682, "learning_rate": 6.153846153846155e-06, "loss": 3.3591, "step": 37 }, { "epoch": 0.01945724526369688, "grad_norm": 1.2502583265304565, "learning_rate": 6.324786324786325e-06, "loss": 3.3602, "step": 38 }, { "epoch": 0.019969278033794162, "grad_norm": 1.1729472875595093, "learning_rate": 6.495726495726496e-06, "loss": 3.1424, "step": 39 }, { "epoch": 0.02048131080389145, "grad_norm": 1.3596287965774536, "learning_rate": 6.666666666666667e-06, "loss": 3.3085, "step": 40 }, { "epoch": 0.020993343573988736, "grad_norm": 2.2369184494018555, "learning_rate": 6.837606837606839e-06, "loss": 3.161, "step": 41 }, { "epoch": 0.021505376344086023, "grad_norm": 1.1120164394378662, "learning_rate": 7.008547008547009e-06, "loss": 3.2252, "step": 42 }, { "epoch": 0.022017409114183307, "grad_norm": 0.9228061437606812, "learning_rate": 7.179487179487179e-06, "loss": 2.8847, "step": 43 }, { "epoch": 0.022529441884280594, "grad_norm": 1.064693570137024, "learning_rate": 7.350427350427351e-06, "loss": 2.9551, "step": 44 }, { "epoch": 0.02304147465437788, "grad_norm": 2.0467774868011475, "learning_rate": 7.521367521367522e-06, "loss": 2.9202, "step": 45 }, { "epoch": 0.023553507424475168, "grad_norm": 1.1722445487976074, "learning_rate": 7.692307692307694e-06, "loss": 2.9474, "step": 46 }, { "epoch": 0.024065540194572452, "grad_norm": 1.1221846342086792, "learning_rate": 7.863247863247863e-06, "loss": 2.8372, "step": 47 }, { "epoch": 0.02457757296466974, "grad_norm": 1.2214652299880981, "learning_rate": 8.034188034188036e-06, "loss": 2.9515, "step": 48 }, { "epoch": 0.025089605734767026, "grad_norm": 0.8685446977615356, "learning_rate": 8.205128205128205e-06, "loss": 2.6877, "step": 49 }, { "epoch": 0.025601638504864313, "grad_norm": 1.282287359237671, "learning_rate": 8.376068376068377e-06, "loss": 2.8079, "step": 50 }, { "epoch": 0.026113671274961597, "grad_norm": 1.225782036781311, "learning_rate": 8.547008547008548e-06, "loss": 2.7309, "step": 51 }, { "epoch": 0.026625704045058884, "grad_norm": 0.8075404763221741, "learning_rate": 8.717948717948717e-06, "loss": 2.6013, "step": 52 }, { "epoch": 0.02713773681515617, "grad_norm": 0.7983790636062622, "learning_rate": 8.88888888888889e-06, "loss": 2.5489, "step": 53 }, { "epoch": 0.027649769585253458, "grad_norm": 1.5385791063308716, "learning_rate": 9.05982905982906e-06, "loss": 2.7261, "step": 54 }, { "epoch": 0.02816180235535074, "grad_norm": 0.796707034111023, "learning_rate": 9.230769230769232e-06, "loss": 2.6259, "step": 55 }, { "epoch": 0.02867383512544803, "grad_norm": 0.7048851847648621, "learning_rate": 9.401709401709402e-06, "loss": 2.5378, "step": 56 }, { "epoch": 0.029185867895545316, "grad_norm": 0.714394748210907, "learning_rate": 9.572649572649575e-06, "loss": 2.4427, "step": 57 }, { "epoch": 0.029697900665642603, "grad_norm": 0.7058516144752502, "learning_rate": 9.743589743589744e-06, "loss": 2.3323, "step": 58 }, { "epoch": 0.030209933435739886, "grad_norm": 0.7270988821983337, "learning_rate": 9.914529914529915e-06, "loss": 2.5307, "step": 59 }, { "epoch": 0.030721966205837174, "grad_norm": 0.7972428798675537, "learning_rate": 1.0085470085470086e-05, "loss": 2.4313, "step": 60 }, { "epoch": 0.03123399897593446, "grad_norm": 0.7130435705184937, "learning_rate": 1.0256410256410256e-05, "loss": 2.392, "step": 61 }, { "epoch": 0.031746031746031744, "grad_norm": 0.7569980621337891, "learning_rate": 1.0427350427350429e-05, "loss": 2.3655, "step": 62 }, { "epoch": 0.03225806451612903, "grad_norm": 0.729451060295105, "learning_rate": 1.0598290598290598e-05, "loss": 2.1844, "step": 63 }, { "epoch": 0.03277009728622632, "grad_norm": 0.8262594938278198, "learning_rate": 1.0769230769230771e-05, "loss": 2.2465, "step": 64 }, { "epoch": 0.033282130056323606, "grad_norm": 2.0017244815826416, "learning_rate": 1.094017094017094e-05, "loss": 2.136, "step": 65 }, { "epoch": 0.03379416282642089, "grad_norm": 0.9036684036254883, "learning_rate": 1.1111111111111112e-05, "loss": 2.1621, "step": 66 }, { "epoch": 0.03430619559651818, "grad_norm": 0.8627972602844238, "learning_rate": 1.1282051282051283e-05, "loss": 2.0473, "step": 67 }, { "epoch": 0.03481822836661547, "grad_norm": 0.999786376953125, "learning_rate": 1.1452991452991452e-05, "loss": 2.0141, "step": 68 }, { "epoch": 0.03533026113671275, "grad_norm": 0.8584579229354858, "learning_rate": 1.1623931623931625e-05, "loss": 1.9143, "step": 69 }, { "epoch": 0.035842293906810034, "grad_norm": 0.9274497628211975, "learning_rate": 1.1794871794871795e-05, "loss": 1.9352, "step": 70 }, { "epoch": 0.03635432667690732, "grad_norm": 0.9018367528915405, "learning_rate": 1.1965811965811967e-05, "loss": 1.836, "step": 71 }, { "epoch": 0.03686635944700461, "grad_norm": 1.1171284914016724, "learning_rate": 1.2136752136752137e-05, "loss": 1.7087, "step": 72 }, { "epoch": 0.037378392217101895, "grad_norm": 0.8913128972053528, "learning_rate": 1.230769230769231e-05, "loss": 1.7192, "step": 73 }, { "epoch": 0.03789042498719918, "grad_norm": 0.8860878944396973, "learning_rate": 1.247863247863248e-05, "loss": 1.6841, "step": 74 }, { "epoch": 0.03840245775729647, "grad_norm": 0.8566821217536926, "learning_rate": 1.264957264957265e-05, "loss": 1.6558, "step": 75 }, { "epoch": 0.03891449052739376, "grad_norm": 0.6862190365791321, "learning_rate": 1.282051282051282e-05, "loss": 1.5026, "step": 76 }, { "epoch": 0.03942652329749104, "grad_norm": 0.6934076547622681, "learning_rate": 1.2991452991452993e-05, "loss": 1.5462, "step": 77 }, { "epoch": 0.039938556067588324, "grad_norm": 0.6947596073150635, "learning_rate": 1.3162393162393164e-05, "loss": 1.4743, "step": 78 }, { "epoch": 0.04045058883768561, "grad_norm": 0.6666589379310608, "learning_rate": 1.3333333333333333e-05, "loss": 1.4511, "step": 79 }, { "epoch": 0.0409626216077829, "grad_norm": 0.5681557059288025, "learning_rate": 1.3504273504273504e-05, "loss": 1.298, "step": 80 }, { "epoch": 0.041474654377880185, "grad_norm": 0.5425779819488525, "learning_rate": 1.3675213675213677e-05, "loss": 1.3321, "step": 81 }, { "epoch": 0.04198668714797747, "grad_norm": 0.549839973449707, "learning_rate": 1.3846153846153847e-05, "loss": 1.2996, "step": 82 }, { "epoch": 0.04249871991807476, "grad_norm": 0.48769548535346985, "learning_rate": 1.4017094017094018e-05, "loss": 1.2947, "step": 83 }, { "epoch": 0.043010752688172046, "grad_norm": 0.45408493280410767, "learning_rate": 1.4188034188034189e-05, "loss": 1.231, "step": 84 }, { "epoch": 0.04352278545826933, "grad_norm": 0.4503278136253357, "learning_rate": 1.4358974358974359e-05, "loss": 1.2362, "step": 85 }, { "epoch": 0.044034818228366614, "grad_norm": 0.41161084175109863, "learning_rate": 1.4529914529914531e-05, "loss": 1.198, "step": 86 }, { "epoch": 0.0445468509984639, "grad_norm": 0.3590110242366791, "learning_rate": 1.4700854700854703e-05, "loss": 1.1563, "step": 87 }, { "epoch": 0.04505888376856119, "grad_norm": 0.3896704912185669, "learning_rate": 1.4871794871794872e-05, "loss": 1.2062, "step": 88 }, { "epoch": 0.045570916538658475, "grad_norm": 0.3641629219055176, "learning_rate": 1.5042735042735043e-05, "loss": 1.1517, "step": 89 }, { "epoch": 0.04608294930875576, "grad_norm": 0.34741657972335815, "learning_rate": 1.5213675213675213e-05, "loss": 1.1491, "step": 90 }, { "epoch": 0.04659498207885305, "grad_norm": 0.28750649094581604, "learning_rate": 1.5384615384615387e-05, "loss": 1.079, "step": 91 }, { "epoch": 0.047107014848950336, "grad_norm": 0.30585750937461853, "learning_rate": 1.5555555555555555e-05, "loss": 1.0574, "step": 92 }, { "epoch": 0.047619047619047616, "grad_norm": 0.2975628972053528, "learning_rate": 1.5726495726495726e-05, "loss": 1.06, "step": 93 }, { "epoch": 0.048131080389144903, "grad_norm": 0.303818017244339, "learning_rate": 1.5897435897435897e-05, "loss": 1.0691, "step": 94 }, { "epoch": 0.04864311315924219, "grad_norm": 0.2619573175907135, "learning_rate": 1.6068376068376072e-05, "loss": 1.0216, "step": 95 }, { "epoch": 0.04915514592933948, "grad_norm": 0.3044736683368683, "learning_rate": 1.623931623931624e-05, "loss": 1.04, "step": 96 }, { "epoch": 0.049667178699436765, "grad_norm": 0.2504830062389374, "learning_rate": 1.641025641025641e-05, "loss": 1.0046, "step": 97 }, { "epoch": 0.05017921146953405, "grad_norm": 0.25186970829963684, "learning_rate": 1.6581196581196582e-05, "loss": 1.0207, "step": 98 }, { "epoch": 0.05069124423963134, "grad_norm": 0.2326544225215912, "learning_rate": 1.6752136752136753e-05, "loss": 0.9673, "step": 99 }, { "epoch": 0.051203277009728626, "grad_norm": 0.244401216506958, "learning_rate": 1.6923076923076924e-05, "loss": 0.9912, "step": 100 }, { "epoch": 0.051715309779825906, "grad_norm": 0.20759882032871246, "learning_rate": 1.7094017094017095e-05, "loss": 0.9913, "step": 101 }, { "epoch": 0.05222734254992319, "grad_norm": 0.1958427131175995, "learning_rate": 1.7264957264957267e-05, "loss": 0.9601, "step": 102 }, { "epoch": 0.05273937532002048, "grad_norm": 0.2322046309709549, "learning_rate": 1.7435897435897434e-05, "loss": 0.9984, "step": 103 }, { "epoch": 0.05325140809011777, "grad_norm": 0.20548316836357117, "learning_rate": 1.760683760683761e-05, "loss": 0.9418, "step": 104 }, { "epoch": 0.053763440860215055, "grad_norm": 0.1974097639322281, "learning_rate": 1.777777777777778e-05, "loss": 0.9524, "step": 105 }, { "epoch": 0.05427547363031234, "grad_norm": 0.1943182796239853, "learning_rate": 1.794871794871795e-05, "loss": 0.9222, "step": 106 }, { "epoch": 0.05478750640040963, "grad_norm": 0.20405244827270508, "learning_rate": 1.811965811965812e-05, "loss": 0.9325, "step": 107 }, { "epoch": 0.055299539170506916, "grad_norm": 0.2744787931442261, "learning_rate": 1.829059829059829e-05, "loss": 0.9156, "step": 108 }, { "epoch": 0.055811571940604196, "grad_norm": 0.20778785645961761, "learning_rate": 1.8461538461538465e-05, "loss": 0.8974, "step": 109 }, { "epoch": 0.05632360471070148, "grad_norm": 0.20918428897857666, "learning_rate": 1.8632478632478632e-05, "loss": 0.9143, "step": 110 }, { "epoch": 0.05683563748079877, "grad_norm": 0.19996380805969238, "learning_rate": 1.8803418803418804e-05, "loss": 0.9319, "step": 111 }, { "epoch": 0.05734767025089606, "grad_norm": 0.16424617171287537, "learning_rate": 1.8974358974358975e-05, "loss": 0.9076, "step": 112 }, { "epoch": 0.057859703020993344, "grad_norm": 0.17009302973747253, "learning_rate": 1.914529914529915e-05, "loss": 0.9007, "step": 113 }, { "epoch": 0.05837173579109063, "grad_norm": 0.16458144783973694, "learning_rate": 1.9316239316239317e-05, "loss": 0.8997, "step": 114 }, { "epoch": 0.05888376856118792, "grad_norm": 0.19761289656162262, "learning_rate": 1.9487179487179488e-05, "loss": 0.8643, "step": 115 }, { "epoch": 0.059395801331285206, "grad_norm": 0.16355127096176147, "learning_rate": 1.965811965811966e-05, "loss": 0.8853, "step": 116 }, { "epoch": 0.059907834101382486, "grad_norm": 0.14890463650226593, "learning_rate": 1.982905982905983e-05, "loss": 0.8763, "step": 117 }, { "epoch": 0.06041986687147977, "grad_norm": 0.15002556145191193, "learning_rate": 2e-05, "loss": 0.8891, "step": 118 }, { "epoch": 0.06093189964157706, "grad_norm": 0.14429616928100586, "learning_rate": 2.0170940170940173e-05, "loss": 0.8498, "step": 119 }, { "epoch": 0.06144393241167435, "grad_norm": 0.15870024263858795, "learning_rate": 2.0341880341880344e-05, "loss": 0.8474, "step": 120 }, { "epoch": 0.061955965181771634, "grad_norm": 0.1399078369140625, "learning_rate": 2.0512820512820512e-05, "loss": 0.8502, "step": 121 }, { "epoch": 0.06246799795186892, "grad_norm": 0.12390255182981491, "learning_rate": 2.0683760683760683e-05, "loss": 0.8308, "step": 122 }, { "epoch": 0.0629800307219662, "grad_norm": 0.13995809853076935, "learning_rate": 2.0854700854700857e-05, "loss": 0.8376, "step": 123 }, { "epoch": 0.06349206349206349, "grad_norm": 0.1324658989906311, "learning_rate": 2.102564102564103e-05, "loss": 0.8578, "step": 124 }, { "epoch": 0.06400409626216078, "grad_norm": 0.12866894900798798, "learning_rate": 2.1196581196581196e-05, "loss": 0.826, "step": 125 }, { "epoch": 0.06451612903225806, "grad_norm": 0.12249317765235901, "learning_rate": 2.1367521367521368e-05, "loss": 0.8304, "step": 126 }, { "epoch": 0.06502816180235535, "grad_norm": 0.1243545264005661, "learning_rate": 2.1538461538461542e-05, "loss": 0.7915, "step": 127 }, { "epoch": 0.06554019457245264, "grad_norm": 0.14660273492336273, "learning_rate": 2.170940170940171e-05, "loss": 0.8588, "step": 128 }, { "epoch": 0.06605222734254992, "grad_norm": 0.1372343897819519, "learning_rate": 2.188034188034188e-05, "loss": 0.8321, "step": 129 }, { "epoch": 0.06656426011264721, "grad_norm": 0.12708920240402222, "learning_rate": 2.2051282051282052e-05, "loss": 0.8316, "step": 130 }, { "epoch": 0.0670762928827445, "grad_norm": 0.224148690700531, "learning_rate": 2.2222222222222223e-05, "loss": 0.7801, "step": 131 }, { "epoch": 0.06758832565284179, "grad_norm": 0.1272125095129013, "learning_rate": 2.2393162393162394e-05, "loss": 0.8344, "step": 132 }, { "epoch": 0.06810035842293907, "grad_norm": 0.1217728778719902, "learning_rate": 2.2564102564102566e-05, "loss": 0.7958, "step": 133 }, { "epoch": 0.06861239119303636, "grad_norm": 0.14137236773967743, "learning_rate": 2.2735042735042737e-05, "loss": 0.7911, "step": 134 }, { "epoch": 0.06912442396313365, "grad_norm": 0.1261357069015503, "learning_rate": 2.2905982905982905e-05, "loss": 0.8221, "step": 135 }, { "epoch": 0.06963645673323093, "grad_norm": 0.15151292085647583, "learning_rate": 2.307692307692308e-05, "loss": 0.8068, "step": 136 }, { "epoch": 0.0701484895033282, "grad_norm": 0.13383713364601135, "learning_rate": 2.324786324786325e-05, "loss": 0.8096, "step": 137 }, { "epoch": 0.0706605222734255, "grad_norm": 0.10953599959611893, "learning_rate": 2.341880341880342e-05, "loss": 0.8092, "step": 138 }, { "epoch": 0.07117255504352278, "grad_norm": 0.11135266721248627, "learning_rate": 2.358974358974359e-05, "loss": 0.7954, "step": 139 }, { "epoch": 0.07168458781362007, "grad_norm": 0.12209566682577133, "learning_rate": 2.376068376068376e-05, "loss": 0.8293, "step": 140 }, { "epoch": 0.07219662058371736, "grad_norm": 0.11371300369501114, "learning_rate": 2.3931623931623935e-05, "loss": 0.8076, "step": 141 }, { "epoch": 0.07270865335381464, "grad_norm": 0.12728151679039001, "learning_rate": 2.4102564102564103e-05, "loss": 0.827, "step": 142 }, { "epoch": 0.07322068612391193, "grad_norm": 0.12316320091485977, "learning_rate": 2.4273504273504274e-05, "loss": 0.7854, "step": 143 }, { "epoch": 0.07373271889400922, "grad_norm": 0.10929558426141739, "learning_rate": 2.4444444444444445e-05, "loss": 0.7864, "step": 144 }, { "epoch": 0.0742447516641065, "grad_norm": 0.10534000396728516, "learning_rate": 2.461538461538462e-05, "loss": 0.7946, "step": 145 }, { "epoch": 0.07475678443420379, "grad_norm": 0.13529212772846222, "learning_rate": 2.4786324786324787e-05, "loss": 0.7963, "step": 146 }, { "epoch": 0.07526881720430108, "grad_norm": 0.11045701801776886, "learning_rate": 2.495726495726496e-05, "loss": 0.7902, "step": 147 }, { "epoch": 0.07578084997439836, "grad_norm": 0.11584638804197311, "learning_rate": 2.512820512820513e-05, "loss": 0.7707, "step": 148 }, { "epoch": 0.07629288274449565, "grad_norm": 0.10732859373092651, "learning_rate": 2.52991452991453e-05, "loss": 0.7657, "step": 149 }, { "epoch": 0.07680491551459294, "grad_norm": 0.11085093766450882, "learning_rate": 2.547008547008547e-05, "loss": 0.7792, "step": 150 }, { "epoch": 0.07731694828469023, "grad_norm": 0.13008089363574982, "learning_rate": 2.564102564102564e-05, "loss": 0.7945, "step": 151 }, { "epoch": 0.07782898105478751, "grad_norm": 0.1461058259010315, "learning_rate": 2.5811965811965814e-05, "loss": 0.7758, "step": 152 }, { "epoch": 0.07834101382488479, "grad_norm": 0.11290822923183441, "learning_rate": 2.5982905982905985e-05, "loss": 0.7834, "step": 153 }, { "epoch": 0.07885304659498207, "grad_norm": 0.11300565302371979, "learning_rate": 2.6153846153846157e-05, "loss": 0.805, "step": 154 }, { "epoch": 0.07936507936507936, "grad_norm": 0.12110847234725952, "learning_rate": 2.6324786324786328e-05, "loss": 0.7764, "step": 155 }, { "epoch": 0.07987711213517665, "grad_norm": 0.10356455296278, "learning_rate": 2.64957264957265e-05, "loss": 0.7484, "step": 156 }, { "epoch": 0.08038914490527393, "grad_norm": 0.14653243124485016, "learning_rate": 2.6666666666666667e-05, "loss": 0.7653, "step": 157 }, { "epoch": 0.08090117767537122, "grad_norm": 0.14659158885478973, "learning_rate": 2.6837606837606838e-05, "loss": 0.764, "step": 158 }, { "epoch": 0.08141321044546851, "grad_norm": 0.11249863356351852, "learning_rate": 2.700854700854701e-05, "loss": 0.7587, "step": 159 }, { "epoch": 0.0819252432155658, "grad_norm": 0.11798933893442154, "learning_rate": 2.717948717948718e-05, "loss": 0.7681, "step": 160 }, { "epoch": 0.08243727598566308, "grad_norm": 0.10725108534097672, "learning_rate": 2.7350427350427355e-05, "loss": 0.7732, "step": 161 }, { "epoch": 0.08294930875576037, "grad_norm": 0.11182805895805359, "learning_rate": 2.7521367521367526e-05, "loss": 0.7684, "step": 162 }, { "epoch": 0.08346134152585766, "grad_norm": 0.11694719642400742, "learning_rate": 2.7692307692307694e-05, "loss": 0.805, "step": 163 }, { "epoch": 0.08397337429595494, "grad_norm": 0.14661794900894165, "learning_rate": 2.7863247863247865e-05, "loss": 0.7608, "step": 164 }, { "epoch": 0.08448540706605223, "grad_norm": 0.10859724879264832, "learning_rate": 2.8034188034188036e-05, "loss": 0.7554, "step": 165 }, { "epoch": 0.08499743983614952, "grad_norm": 0.11542957276105881, "learning_rate": 2.8205128205128207e-05, "loss": 0.7761, "step": 166 }, { "epoch": 0.0855094726062468, "grad_norm": 0.12732790410518646, "learning_rate": 2.8376068376068378e-05, "loss": 0.7719, "step": 167 }, { "epoch": 0.08602150537634409, "grad_norm": 0.10629823803901672, "learning_rate": 2.8547008547008546e-05, "loss": 0.7381, "step": 168 }, { "epoch": 0.08653353814644137, "grad_norm": 0.12237891554832458, "learning_rate": 2.8717948717948717e-05, "loss": 0.7901, "step": 169 }, { "epoch": 0.08704557091653865, "grad_norm": 0.10036913305521011, "learning_rate": 2.8888888888888888e-05, "loss": 0.7476, "step": 170 }, { "epoch": 0.08755760368663594, "grad_norm": 0.10591697692871094, "learning_rate": 2.9059829059829063e-05, "loss": 0.7549, "step": 171 }, { "epoch": 0.08806963645673323, "grad_norm": 0.1098855510354042, "learning_rate": 2.9230769230769234e-05, "loss": 0.788, "step": 172 }, { "epoch": 0.08858166922683051, "grad_norm": 0.10973232239484787, "learning_rate": 2.9401709401709405e-05, "loss": 0.7555, "step": 173 }, { "epoch": 0.0890937019969278, "grad_norm": 0.12243342399597168, "learning_rate": 2.9572649572649573e-05, "loss": 0.7647, "step": 174 }, { "epoch": 0.08960573476702509, "grad_norm": 0.11168920993804932, "learning_rate": 2.9743589743589744e-05, "loss": 0.7719, "step": 175 }, { "epoch": 0.09011776753712238, "grad_norm": 0.10491779446601868, "learning_rate": 2.9914529914529915e-05, "loss": 0.7518, "step": 176 }, { "epoch": 0.09062980030721966, "grad_norm": 0.10787490010261536, "learning_rate": 3.0085470085470086e-05, "loss": 0.7578, "step": 177 }, { "epoch": 0.09114183307731695, "grad_norm": 0.11039704084396362, "learning_rate": 3.0256410256410257e-05, "loss": 0.765, "step": 178 }, { "epoch": 0.09165386584741424, "grad_norm": 0.10192885249853134, "learning_rate": 3.0427350427350425e-05, "loss": 0.7323, "step": 179 }, { "epoch": 0.09216589861751152, "grad_norm": 0.172809436917305, "learning_rate": 3.05982905982906e-05, "loss": 0.7574, "step": 180 }, { "epoch": 0.09267793138760881, "grad_norm": 0.12139556556940079, "learning_rate": 3.0769230769230774e-05, "loss": 0.7357, "step": 181 }, { "epoch": 0.0931899641577061, "grad_norm": 0.3149567246437073, "learning_rate": 3.0940170940170946e-05, "loss": 0.7444, "step": 182 }, { "epoch": 0.09370199692780339, "grad_norm": 0.11409509181976318, "learning_rate": 3.111111111111111e-05, "loss": 0.7669, "step": 183 }, { "epoch": 0.09421402969790067, "grad_norm": 0.15661323070526123, "learning_rate": 3.128205128205128e-05, "loss": 0.756, "step": 184 }, { "epoch": 0.09472606246799795, "grad_norm": 0.1074863001704216, "learning_rate": 3.145299145299145e-05, "loss": 0.7372, "step": 185 }, { "epoch": 0.09523809523809523, "grad_norm": 0.3065205514431, "learning_rate": 3.162393162393162e-05, "loss": 0.7378, "step": 186 }, { "epoch": 0.09575012800819252, "grad_norm": 0.11278647184371948, "learning_rate": 3.1794871794871795e-05, "loss": 0.7279, "step": 187 }, { "epoch": 0.09626216077828981, "grad_norm": 0.13617998361587524, "learning_rate": 3.1965811965811966e-05, "loss": 0.7529, "step": 188 }, { "epoch": 0.0967741935483871, "grad_norm": 0.1188589408993721, "learning_rate": 3.2136752136752144e-05, "loss": 0.7736, "step": 189 }, { "epoch": 0.09728622631848438, "grad_norm": 0.11834564059972763, "learning_rate": 3.230769230769231e-05, "loss": 0.7342, "step": 190 }, { "epoch": 0.09779825908858167, "grad_norm": 0.11700379103422165, "learning_rate": 3.247863247863248e-05, "loss": 0.7294, "step": 191 }, { "epoch": 0.09831029185867896, "grad_norm": 0.12790656089782715, "learning_rate": 3.264957264957265e-05, "loss": 0.7463, "step": 192 }, { "epoch": 0.09882232462877624, "grad_norm": 0.10581191629171371, "learning_rate": 3.282051282051282e-05, "loss": 0.7337, "step": 193 }, { "epoch": 0.09933435739887353, "grad_norm": 0.1702089160680771, "learning_rate": 3.299145299145299e-05, "loss": 0.7305, "step": 194 }, { "epoch": 0.09984639016897082, "grad_norm": 0.18756937980651855, "learning_rate": 3.3162393162393164e-05, "loss": 0.7395, "step": 195 }, { "epoch": 0.1003584229390681, "grad_norm": 0.11946378648281097, "learning_rate": 3.3333333333333335e-05, "loss": 0.7437, "step": 196 }, { "epoch": 0.10087045570916539, "grad_norm": 0.11496727913618088, "learning_rate": 3.3504273504273506e-05, "loss": 0.7386, "step": 197 }, { "epoch": 0.10138248847926268, "grad_norm": 0.10862918198108673, "learning_rate": 3.367521367521368e-05, "loss": 0.7014, "step": 198 }, { "epoch": 0.10189452124935997, "grad_norm": 0.12413224577903748, "learning_rate": 3.384615384615385e-05, "loss": 0.7432, "step": 199 }, { "epoch": 0.10240655401945725, "grad_norm": 0.11936026066541672, "learning_rate": 3.401709401709402e-05, "loss": 0.7707, "step": 200 }, { "epoch": 0.10291858678955453, "grad_norm": 0.11068830639123917, "learning_rate": 3.418803418803419e-05, "loss": 0.7313, "step": 201 }, { "epoch": 0.10343061955965181, "grad_norm": 0.11828430742025375, "learning_rate": 3.435897435897436e-05, "loss": 0.7439, "step": 202 }, { "epoch": 0.1039426523297491, "grad_norm": 0.1323801428079605, "learning_rate": 3.452991452991453e-05, "loss": 0.7203, "step": 203 }, { "epoch": 0.10445468509984639, "grad_norm": 0.11267322301864624, "learning_rate": 3.4700854700854704e-05, "loss": 0.7231, "step": 204 }, { "epoch": 0.10496671786994367, "grad_norm": 0.12175678461790085, "learning_rate": 3.487179487179487e-05, "loss": 0.7323, "step": 205 }, { "epoch": 0.10547875064004096, "grad_norm": 0.12117466330528259, "learning_rate": 3.504273504273504e-05, "loss": 0.7337, "step": 206 }, { "epoch": 0.10599078341013825, "grad_norm": 0.15108397603034973, "learning_rate": 3.521367521367522e-05, "loss": 0.7526, "step": 207 }, { "epoch": 0.10650281618023553, "grad_norm": 0.1337517499923706, "learning_rate": 3.538461538461539e-05, "loss": 0.7112, "step": 208 }, { "epoch": 0.10701484895033282, "grad_norm": 0.1242760494351387, "learning_rate": 3.555555555555556e-05, "loss": 0.7337, "step": 209 }, { "epoch": 0.10752688172043011, "grad_norm": 0.12469953298568726, "learning_rate": 3.572649572649573e-05, "loss": 0.7363, "step": 210 }, { "epoch": 0.1080389144905274, "grad_norm": 0.12563076615333557, "learning_rate": 3.58974358974359e-05, "loss": 0.7328, "step": 211 }, { "epoch": 0.10855094726062468, "grad_norm": 0.11981146782636642, "learning_rate": 3.606837606837607e-05, "loss": 0.741, "step": 212 }, { "epoch": 0.10906298003072197, "grad_norm": 0.1365303099155426, "learning_rate": 3.623931623931624e-05, "loss": 0.7364, "step": 213 }, { "epoch": 0.10957501280081926, "grad_norm": 0.11297251284122467, "learning_rate": 3.641025641025641e-05, "loss": 0.711, "step": 214 }, { "epoch": 0.11008704557091654, "grad_norm": 0.2129683643579483, "learning_rate": 3.658119658119658e-05, "loss": 0.7386, "step": 215 }, { "epoch": 0.11059907834101383, "grad_norm": 0.13129866123199463, "learning_rate": 3.675213675213676e-05, "loss": 0.7109, "step": 216 }, { "epoch": 0.1111111111111111, "grad_norm": 0.31776461005210876, "learning_rate": 3.692307692307693e-05, "loss": 0.7509, "step": 217 }, { "epoch": 0.11162314388120839, "grad_norm": 0.13636350631713867, "learning_rate": 3.70940170940171e-05, "loss": 0.738, "step": 218 }, { "epoch": 0.11213517665130568, "grad_norm": 0.14461447298526764, "learning_rate": 3.7264957264957265e-05, "loss": 0.7217, "step": 219 }, { "epoch": 0.11264720942140297, "grad_norm": 0.1588062047958374, "learning_rate": 3.7435897435897436e-05, "loss": 0.7183, "step": 220 }, { "epoch": 0.11315924219150025, "grad_norm": 0.13373608887195587, "learning_rate": 3.760683760683761e-05, "loss": 0.7146, "step": 221 }, { "epoch": 0.11367127496159754, "grad_norm": 0.1468418538570404, "learning_rate": 3.777777777777778e-05, "loss": 0.7128, "step": 222 }, { "epoch": 0.11418330773169483, "grad_norm": 0.1539921760559082, "learning_rate": 3.794871794871795e-05, "loss": 0.7371, "step": 223 }, { "epoch": 0.11469534050179211, "grad_norm": 0.1349155306816101, "learning_rate": 3.811965811965812e-05, "loss": 0.7063, "step": 224 }, { "epoch": 0.1152073732718894, "grad_norm": 0.15524713695049286, "learning_rate": 3.82905982905983e-05, "loss": 0.7251, "step": 225 }, { "epoch": 0.11571940604198669, "grad_norm": 0.1849210560321808, "learning_rate": 3.846153846153846e-05, "loss": 0.7445, "step": 226 }, { "epoch": 0.11623143881208398, "grad_norm": 0.15272629261016846, "learning_rate": 3.8632478632478634e-05, "loss": 0.7188, "step": 227 }, { "epoch": 0.11674347158218126, "grad_norm": 0.1561354696750641, "learning_rate": 3.8803418803418805e-05, "loss": 0.7038, "step": 228 }, { "epoch": 0.11725550435227855, "grad_norm": 0.14715760946273804, "learning_rate": 3.8974358974358976e-05, "loss": 0.7328, "step": 229 }, { "epoch": 0.11776753712237584, "grad_norm": 0.1884879320859909, "learning_rate": 3.914529914529915e-05, "loss": 0.6927, "step": 230 }, { "epoch": 0.11827956989247312, "grad_norm": 0.13177721202373505, "learning_rate": 3.931623931623932e-05, "loss": 0.7125, "step": 231 }, { "epoch": 0.11879160266257041, "grad_norm": 0.18603788316249847, "learning_rate": 3.948717948717949e-05, "loss": 0.7292, "step": 232 }, { "epoch": 0.11930363543266768, "grad_norm": 0.19759860634803772, "learning_rate": 3.965811965811966e-05, "loss": 0.7363, "step": 233 }, { "epoch": 0.11981566820276497, "grad_norm": 0.1680307537317276, "learning_rate": 3.9829059829059825e-05, "loss": 0.689, "step": 234 }, { "epoch": 0.12032770097286226, "grad_norm": 0.16469095647335052, "learning_rate": 4e-05, "loss": 0.7313, "step": 235 }, { "epoch": 0.12083973374295955, "grad_norm": 0.21936285495758057, "learning_rate": 4.0170940170940174e-05, "loss": 0.7083, "step": 236 }, { "epoch": 0.12135176651305683, "grad_norm": 0.1634967178106308, "learning_rate": 4.0341880341880346e-05, "loss": 0.7359, "step": 237 }, { "epoch": 0.12186379928315412, "grad_norm": 0.18534508347511292, "learning_rate": 4.051282051282052e-05, "loss": 0.7134, "step": 238 }, { "epoch": 0.12237583205325141, "grad_norm": 0.18292881548404694, "learning_rate": 4.068376068376069e-05, "loss": 0.7036, "step": 239 }, { "epoch": 0.1228878648233487, "grad_norm": 0.1636057198047638, "learning_rate": 4.085470085470086e-05, "loss": 0.7356, "step": 240 }, { "epoch": 0.12339989759344598, "grad_norm": 0.16275708377361298, "learning_rate": 4.1025641025641023e-05, "loss": 0.7182, "step": 241 }, { "epoch": 0.12391193036354327, "grad_norm": 0.15663281083106995, "learning_rate": 4.1196581196581195e-05, "loss": 0.7008, "step": 242 }, { "epoch": 0.12442396313364056, "grad_norm": 0.14787471294403076, "learning_rate": 4.1367521367521366e-05, "loss": 0.7338, "step": 243 }, { "epoch": 0.12493599590373784, "grad_norm": 0.16425377130508423, "learning_rate": 4.1538461538461544e-05, "loss": 0.6967, "step": 244 }, { "epoch": 0.12544802867383512, "grad_norm": 0.19695420563220978, "learning_rate": 4.1709401709401715e-05, "loss": 0.7228, "step": 245 }, { "epoch": 0.1259600614439324, "grad_norm": 0.20708788931369781, "learning_rate": 4.1880341880341886e-05, "loss": 0.6947, "step": 246 }, { "epoch": 0.1264720942140297, "grad_norm": 0.18466190993785858, "learning_rate": 4.205128205128206e-05, "loss": 0.7033, "step": 247 }, { "epoch": 0.12698412698412698, "grad_norm": 0.19793455302715302, "learning_rate": 4.222222222222222e-05, "loss": 0.6921, "step": 248 }, { "epoch": 0.12749615975422426, "grad_norm": 0.308807909488678, "learning_rate": 4.239316239316239e-05, "loss": 0.7069, "step": 249 }, { "epoch": 0.12800819252432155, "grad_norm": 0.34367072582244873, "learning_rate": 4.2564102564102564e-05, "loss": 0.7084, "step": 250 }, { "epoch": 0.12852022529441884, "grad_norm": 0.2111034095287323, "learning_rate": 4.2735042735042735e-05, "loss": 0.7017, "step": 251 }, { "epoch": 0.12903225806451613, "grad_norm": 0.21684569120407104, "learning_rate": 4.2905982905982906e-05, "loss": 0.6992, "step": 252 }, { "epoch": 0.1295442908346134, "grad_norm": 0.2714485824108124, "learning_rate": 4.3076923076923084e-05, "loss": 0.7108, "step": 253 }, { "epoch": 0.1300563236047107, "grad_norm": 0.20336472988128662, "learning_rate": 4.324786324786325e-05, "loss": 0.7032, "step": 254 }, { "epoch": 0.130568356374808, "grad_norm": 0.17772671580314636, "learning_rate": 4.341880341880342e-05, "loss": 0.7238, "step": 255 }, { "epoch": 0.13108038914490527, "grad_norm": 0.18825297057628632, "learning_rate": 4.358974358974359e-05, "loss": 0.7, "step": 256 }, { "epoch": 0.13159242191500256, "grad_norm": 0.2909964621067047, "learning_rate": 4.376068376068376e-05, "loss": 0.698, "step": 257 }, { "epoch": 0.13210445468509985, "grad_norm": 0.2634066045284271, "learning_rate": 4.393162393162393e-05, "loss": 0.7214, "step": 258 }, { "epoch": 0.13261648745519714, "grad_norm": 0.2525797188282013, "learning_rate": 4.4102564102564104e-05, "loss": 0.6981, "step": 259 }, { "epoch": 0.13312852022529442, "grad_norm": 0.18515442311763763, "learning_rate": 4.4273504273504275e-05, "loss": 0.7023, "step": 260 }, { "epoch": 0.1336405529953917, "grad_norm": 0.22990725934505463, "learning_rate": 4.4444444444444447e-05, "loss": 0.7208, "step": 261 }, { "epoch": 0.134152585765489, "grad_norm": 0.3801679313182831, "learning_rate": 4.461538461538462e-05, "loss": 0.7065, "step": 262 }, { "epoch": 0.13466461853558628, "grad_norm": 0.23206549882888794, "learning_rate": 4.478632478632479e-05, "loss": 0.7083, "step": 263 }, { "epoch": 0.13517665130568357, "grad_norm": 0.2300211489200592, "learning_rate": 4.495726495726496e-05, "loss": 0.6896, "step": 264 }, { "epoch": 0.13568868407578086, "grad_norm": 0.21424394845962524, "learning_rate": 4.512820512820513e-05, "loss": 0.7275, "step": 265 }, { "epoch": 0.13620071684587814, "grad_norm": 0.1980319768190384, "learning_rate": 4.52991452991453e-05, "loss": 0.7157, "step": 266 }, { "epoch": 0.13671274961597543, "grad_norm": 0.21018141508102417, "learning_rate": 4.5470085470085474e-05, "loss": 0.7106, "step": 267 }, { "epoch": 0.13722478238607272, "grad_norm": 0.20382367074489594, "learning_rate": 4.5641025641025645e-05, "loss": 0.6916, "step": 268 }, { "epoch": 0.13773681515617, "grad_norm": 0.20072416961193085, "learning_rate": 4.581196581196581e-05, "loss": 0.697, "step": 269 }, { "epoch": 0.1382488479262673, "grad_norm": 0.19181309640407562, "learning_rate": 4.598290598290598e-05, "loss": 0.7071, "step": 270 }, { "epoch": 0.13876088069636458, "grad_norm": 0.2007455974817276, "learning_rate": 4.615384615384616e-05, "loss": 0.7122, "step": 271 }, { "epoch": 0.13927291346646187, "grad_norm": 0.19100092351436615, "learning_rate": 4.632478632478633e-05, "loss": 0.7065, "step": 272 }, { "epoch": 0.13978494623655913, "grad_norm": 0.274463951587677, "learning_rate": 4.64957264957265e-05, "loss": 0.6991, "step": 273 }, { "epoch": 0.1402969790066564, "grad_norm": 0.3282463848590851, "learning_rate": 4.666666666666667e-05, "loss": 0.6897, "step": 274 }, { "epoch": 0.1408090117767537, "grad_norm": 0.3429318070411682, "learning_rate": 4.683760683760684e-05, "loss": 0.6994, "step": 275 }, { "epoch": 0.141321044546851, "grad_norm": 0.31900084018707275, "learning_rate": 4.700854700854701e-05, "loss": 0.7002, "step": 276 }, { "epoch": 0.14183307731694828, "grad_norm": 0.3011816143989563, "learning_rate": 4.717948717948718e-05, "loss": 0.7231, "step": 277 }, { "epoch": 0.14234511008704556, "grad_norm": 0.238892063498497, "learning_rate": 4.735042735042735e-05, "loss": 0.6757, "step": 278 }, { "epoch": 0.14285714285714285, "grad_norm": 0.24854741990566254, "learning_rate": 4.752136752136752e-05, "loss": 0.7075, "step": 279 }, { "epoch": 0.14336917562724014, "grad_norm": 0.267141729593277, "learning_rate": 4.76923076923077e-05, "loss": 0.691, "step": 280 }, { "epoch": 0.14388120839733742, "grad_norm": 0.2801644504070282, "learning_rate": 4.786324786324787e-05, "loss": 0.6858, "step": 281 }, { "epoch": 0.1443932411674347, "grad_norm": 0.5701198577880859, "learning_rate": 4.803418803418804e-05, "loss": 0.7214, "step": 282 }, { "epoch": 0.144905273937532, "grad_norm": 0.3165575861930847, "learning_rate": 4.8205128205128205e-05, "loss": 0.721, "step": 283 }, { "epoch": 0.14541730670762928, "grad_norm": 0.2744433879852295, "learning_rate": 4.8376068376068376e-05, "loss": 0.7041, "step": 284 }, { "epoch": 0.14592933947772657, "grad_norm": 0.36085155606269836, "learning_rate": 4.854700854700855e-05, "loss": 0.6716, "step": 285 }, { "epoch": 0.14644137224782386, "grad_norm": 0.3339100480079651, "learning_rate": 4.871794871794872e-05, "loss": 0.7191, "step": 286 }, { "epoch": 0.14695340501792115, "grad_norm": 0.4351714849472046, "learning_rate": 4.888888888888889e-05, "loss": 0.7078, "step": 287 }, { "epoch": 0.14746543778801843, "grad_norm": 0.39258843660354614, "learning_rate": 4.905982905982906e-05, "loss": 0.6848, "step": 288 }, { "epoch": 0.14797747055811572, "grad_norm": 0.3089747428894043, "learning_rate": 4.923076923076924e-05, "loss": 0.6862, "step": 289 }, { "epoch": 0.148489503328213, "grad_norm": 0.27622178196907043, "learning_rate": 4.94017094017094e-05, "loss": 0.6864, "step": 290 }, { "epoch": 0.1490015360983103, "grad_norm": 0.40605440735816956, "learning_rate": 4.9572649572649575e-05, "loss": 0.6983, "step": 291 }, { "epoch": 0.14951356886840758, "grad_norm": 0.36792489886283875, "learning_rate": 4.9743589743589746e-05, "loss": 0.7061, "step": 292 }, { "epoch": 0.15002560163850487, "grad_norm": 0.2476658672094345, "learning_rate": 4.991452991452992e-05, "loss": 0.7197, "step": 293 }, { "epoch": 0.15053763440860216, "grad_norm": 0.3459942638874054, "learning_rate": 5.008547008547009e-05, "loss": 0.6877, "step": 294 }, { "epoch": 0.15104966717869944, "grad_norm": 0.435303658246994, "learning_rate": 5.025641025641026e-05, "loss": 0.6851, "step": 295 }, { "epoch": 0.15156169994879673, "grad_norm": 0.285854309797287, "learning_rate": 5.042735042735043e-05, "loss": 0.6638, "step": 296 }, { "epoch": 0.15207373271889402, "grad_norm": 0.4301801919937134, "learning_rate": 5.05982905982906e-05, "loss": 0.7086, "step": 297 }, { "epoch": 0.1525857654889913, "grad_norm": 0.34547773003578186, "learning_rate": 5.0769230769230766e-05, "loss": 0.6905, "step": 298 }, { "epoch": 0.1530977982590886, "grad_norm": 0.31745001673698425, "learning_rate": 5.094017094017094e-05, "loss": 0.6778, "step": 299 }, { "epoch": 0.15360983102918588, "grad_norm": 0.47908803820610046, "learning_rate": 5.111111111111111e-05, "loss": 0.7122, "step": 300 }, { "epoch": 0.15412186379928317, "grad_norm": 0.4313029944896698, "learning_rate": 5.128205128205128e-05, "loss": 0.7004, "step": 301 }, { "epoch": 0.15463389656938045, "grad_norm": 0.2768004536628723, "learning_rate": 5.145299145299145e-05, "loss": 0.6952, "step": 302 }, { "epoch": 0.15514592933947774, "grad_norm": 0.28131580352783203, "learning_rate": 5.162393162393163e-05, "loss": 0.6905, "step": 303 }, { "epoch": 0.15565796210957503, "grad_norm": 0.31095167994499207, "learning_rate": 5.17948717948718e-05, "loss": 0.7005, "step": 304 }, { "epoch": 0.15616999487967229, "grad_norm": 0.2696428894996643, "learning_rate": 5.196581196581197e-05, "loss": 0.6794, "step": 305 }, { "epoch": 0.15668202764976957, "grad_norm": 0.37772417068481445, "learning_rate": 5.213675213675214e-05, "loss": 0.7218, "step": 306 }, { "epoch": 0.15719406041986686, "grad_norm": 1.2969069480895996, "learning_rate": 5.230769230769231e-05, "loss": 0.7128, "step": 307 }, { "epoch": 0.15770609318996415, "grad_norm": 0.3942278325557709, "learning_rate": 5.2478632478632484e-05, "loss": 0.6723, "step": 308 }, { "epoch": 0.15821812596006143, "grad_norm": 0.47124919295310974, "learning_rate": 5.2649572649572655e-05, "loss": 0.6798, "step": 309 }, { "epoch": 0.15873015873015872, "grad_norm": 0.44106024503707886, "learning_rate": 5.2820512820512826e-05, "loss": 0.68, "step": 310 }, { "epoch": 0.159242191500256, "grad_norm": 0.47388142347335815, "learning_rate": 5.2991452991453e-05, "loss": 0.7049, "step": 311 }, { "epoch": 0.1597542242703533, "grad_norm": 0.4685438871383667, "learning_rate": 5.316239316239316e-05, "loss": 0.6843, "step": 312 }, { "epoch": 0.16026625704045058, "grad_norm": 0.42994019389152527, "learning_rate": 5.333333333333333e-05, "loss": 0.6782, "step": 313 }, { "epoch": 0.16077828981054787, "grad_norm": 0.4235561788082123, "learning_rate": 5.3504273504273504e-05, "loss": 0.7043, "step": 314 }, { "epoch": 0.16129032258064516, "grad_norm": 0.2660256028175354, "learning_rate": 5.3675213675213675e-05, "loss": 0.6751, "step": 315 }, { "epoch": 0.16180235535074244, "grad_norm": 0.4266120195388794, "learning_rate": 5.384615384615385e-05, "loss": 0.7066, "step": 316 }, { "epoch": 0.16231438812083973, "grad_norm": 0.5439596176147461, "learning_rate": 5.401709401709402e-05, "loss": 0.7074, "step": 317 }, { "epoch": 0.16282642089093702, "grad_norm": 0.4253218173980713, "learning_rate": 5.418803418803419e-05, "loss": 0.6837, "step": 318 }, { "epoch": 0.1633384536610343, "grad_norm": 0.38931718468666077, "learning_rate": 5.435897435897436e-05, "loss": 0.6736, "step": 319 }, { "epoch": 0.1638504864311316, "grad_norm": 0.5418940782546997, "learning_rate": 5.4529914529914525e-05, "loss": 0.687, "step": 320 }, { "epoch": 0.16436251920122888, "grad_norm": 0.40325435996055603, "learning_rate": 5.470085470085471e-05, "loss": 0.678, "step": 321 }, { "epoch": 0.16487455197132617, "grad_norm": 0.2839350998401642, "learning_rate": 5.487179487179488e-05, "loss": 0.6923, "step": 322 }, { "epoch": 0.16538658474142345, "grad_norm": 0.36034199595451355, "learning_rate": 5.504273504273505e-05, "loss": 0.6963, "step": 323 }, { "epoch": 0.16589861751152074, "grad_norm": 0.2954145073890686, "learning_rate": 5.521367521367522e-05, "loss": 0.6897, "step": 324 }, { "epoch": 0.16641065028161803, "grad_norm": 0.356808066368103, "learning_rate": 5.538461538461539e-05, "loss": 0.6851, "step": 325 }, { "epoch": 0.16692268305171531, "grad_norm": 0.2651332914829254, "learning_rate": 5.555555555555556e-05, "loss": 0.6596, "step": 326 }, { "epoch": 0.1674347158218126, "grad_norm": 0.3125380277633667, "learning_rate": 5.572649572649573e-05, "loss": 0.6676, "step": 327 }, { "epoch": 0.1679467485919099, "grad_norm": 0.41362297534942627, "learning_rate": 5.58974358974359e-05, "loss": 0.6791, "step": 328 }, { "epoch": 0.16845878136200718, "grad_norm": 0.3368385434150696, "learning_rate": 5.606837606837607e-05, "loss": 0.68, "step": 329 }, { "epoch": 0.16897081413210446, "grad_norm": 0.2944033145904541, "learning_rate": 5.623931623931624e-05, "loss": 0.7315, "step": 330 }, { "epoch": 0.16948284690220175, "grad_norm": 0.38459914922714233, "learning_rate": 5.6410256410256414e-05, "loss": 0.6999, "step": 331 }, { "epoch": 0.16999487967229904, "grad_norm": 0.4893406927585602, "learning_rate": 5.6581196581196585e-05, "loss": 0.7089, "step": 332 }, { "epoch": 0.17050691244239632, "grad_norm": 0.4632093608379364, "learning_rate": 5.6752136752136756e-05, "loss": 0.7115, "step": 333 }, { "epoch": 0.1710189452124936, "grad_norm": 0.31523972749710083, "learning_rate": 5.692307692307692e-05, "loss": 0.6861, "step": 334 }, { "epoch": 0.1715309779825909, "grad_norm": 0.2935820519924164, "learning_rate": 5.709401709401709e-05, "loss": 0.6801, "step": 335 }, { "epoch": 0.17204301075268819, "grad_norm": 0.37856343388557434, "learning_rate": 5.726495726495726e-05, "loss": 0.6874, "step": 336 }, { "epoch": 0.17255504352278545, "grad_norm": 0.3186506927013397, "learning_rate": 5.7435897435897434e-05, "loss": 0.6647, "step": 337 }, { "epoch": 0.17306707629288273, "grad_norm": 0.29061371088027954, "learning_rate": 5.7606837606837605e-05, "loss": 0.7043, "step": 338 }, { "epoch": 0.17357910906298002, "grad_norm": 0.3113435208797455, "learning_rate": 5.7777777777777776e-05, "loss": 0.694, "step": 339 }, { "epoch": 0.1740911418330773, "grad_norm": 0.3768657445907593, "learning_rate": 5.7948717948717954e-05, "loss": 0.6863, "step": 340 }, { "epoch": 0.1746031746031746, "grad_norm": 0.41343954205513, "learning_rate": 5.8119658119658126e-05, "loss": 0.6821, "step": 341 }, { "epoch": 0.17511520737327188, "grad_norm": 0.42961230874061584, "learning_rate": 5.82905982905983e-05, "loss": 0.6746, "step": 342 }, { "epoch": 0.17562724014336917, "grad_norm": 0.41212499141693115, "learning_rate": 5.846153846153847e-05, "loss": 0.6952, "step": 343 }, { "epoch": 0.17613927291346645, "grad_norm": 0.4328208565711975, "learning_rate": 5.863247863247864e-05, "loss": 0.6852, "step": 344 }, { "epoch": 0.17665130568356374, "grad_norm": 0.320133239030838, "learning_rate": 5.880341880341881e-05, "loss": 0.6775, "step": 345 }, { "epoch": 0.17716333845366103, "grad_norm": 0.6244208812713623, "learning_rate": 5.897435897435898e-05, "loss": 0.7035, "step": 346 }, { "epoch": 0.17767537122375832, "grad_norm": 0.523115873336792, "learning_rate": 5.9145299145299146e-05, "loss": 0.6929, "step": 347 }, { "epoch": 0.1781874039938556, "grad_norm": 0.48724666237831116, "learning_rate": 5.931623931623932e-05, "loss": 0.6832, "step": 348 }, { "epoch": 0.1786994367639529, "grad_norm": 0.3848056197166443, "learning_rate": 5.948717948717949e-05, "loss": 0.6818, "step": 349 }, { "epoch": 0.17921146953405018, "grad_norm": 0.5351191759109497, "learning_rate": 5.965811965811966e-05, "loss": 0.7131, "step": 350 }, { "epoch": 0.17972350230414746, "grad_norm": 0.4643009901046753, "learning_rate": 5.982905982905983e-05, "loss": 0.6889, "step": 351 }, { "epoch": 0.18023553507424475, "grad_norm": 0.37807878851890564, "learning_rate": 6e-05, "loss": 0.6859, "step": 352 }, { "epoch": 0.18074756784434204, "grad_norm": 0.34994423389434814, "learning_rate": 6.017094017094017e-05, "loss": 0.7041, "step": 353 }, { "epoch": 0.18125960061443933, "grad_norm": 0.32685568928718567, "learning_rate": 6.0341880341880344e-05, "loss": 0.6962, "step": 354 }, { "epoch": 0.1817716333845366, "grad_norm": 0.347738116979599, "learning_rate": 6.0512820512820515e-05, "loss": 0.6659, "step": 355 }, { "epoch": 0.1822836661546339, "grad_norm": 0.37150371074676514, "learning_rate": 6.068376068376068e-05, "loss": 0.7003, "step": 356 }, { "epoch": 0.1827956989247312, "grad_norm": 0.33828142285346985, "learning_rate": 6.085470085470085e-05, "loss": 0.6896, "step": 357 }, { "epoch": 0.18330773169482847, "grad_norm": 0.3304486572742462, "learning_rate": 6.1025641025641035e-05, "loss": 0.6865, "step": 358 }, { "epoch": 0.18381976446492576, "grad_norm": 0.2963810861110687, "learning_rate": 6.11965811965812e-05, "loss": 0.6872, "step": 359 }, { "epoch": 0.18433179723502305, "grad_norm": 0.29512467980384827, "learning_rate": 6.136752136752138e-05, "loss": 0.6847, "step": 360 }, { "epoch": 0.18484383000512034, "grad_norm": 0.3215121626853943, "learning_rate": 6.153846153846155e-05, "loss": 0.6658, "step": 361 }, { "epoch": 0.18535586277521762, "grad_norm": 0.3445289433002472, "learning_rate": 6.170940170940172e-05, "loss": 0.6785, "step": 362 }, { "epoch": 0.1858678955453149, "grad_norm": 0.28570419549942017, "learning_rate": 6.188034188034189e-05, "loss": 0.674, "step": 363 }, { "epoch": 0.1863799283154122, "grad_norm": 0.3057447373867035, "learning_rate": 6.205128205128206e-05, "loss": 0.6517, "step": 364 }, { "epoch": 0.18689196108550948, "grad_norm": 0.3129388093948364, "learning_rate": 6.222222222222222e-05, "loss": 0.6831, "step": 365 }, { "epoch": 0.18740399385560677, "grad_norm": 0.3431328535079956, "learning_rate": 6.239316239316239e-05, "loss": 0.6862, "step": 366 }, { "epoch": 0.18791602662570406, "grad_norm": 0.4060856103897095, "learning_rate": 6.256410256410256e-05, "loss": 0.6613, "step": 367 }, { "epoch": 0.18842805939580134, "grad_norm": 0.4822655916213989, "learning_rate": 6.273504273504273e-05, "loss": 0.6865, "step": 368 }, { "epoch": 0.1889400921658986, "grad_norm": 0.48503178358078003, "learning_rate": 6.29059829059829e-05, "loss": 0.7056, "step": 369 }, { "epoch": 0.1894521249359959, "grad_norm": 0.36613255739212036, "learning_rate": 6.307692307692308e-05, "loss": 0.6715, "step": 370 }, { "epoch": 0.18996415770609318, "grad_norm": 0.3731935918331146, "learning_rate": 6.324786324786325e-05, "loss": 0.6928, "step": 371 }, { "epoch": 0.19047619047619047, "grad_norm": 0.4717727601528168, "learning_rate": 6.341880341880342e-05, "loss": 0.6938, "step": 372 }, { "epoch": 0.19098822324628775, "grad_norm": 0.5181594491004944, "learning_rate": 6.358974358974359e-05, "loss": 0.6691, "step": 373 }, { "epoch": 0.19150025601638504, "grad_norm": 0.4847472906112671, "learning_rate": 6.376068376068376e-05, "loss": 0.6834, "step": 374 }, { "epoch": 0.19201228878648233, "grad_norm": 0.4678697884082794, "learning_rate": 6.393162393162393e-05, "loss": 0.6777, "step": 375 }, { "epoch": 0.19252432155657961, "grad_norm": 0.3996281921863556, "learning_rate": 6.410256410256412e-05, "loss": 0.6646, "step": 376 }, { "epoch": 0.1930363543266769, "grad_norm": 0.3921671509742737, "learning_rate": 6.427350427350429e-05, "loss": 0.6791, "step": 377 }, { "epoch": 0.1935483870967742, "grad_norm": 0.4808714985847473, "learning_rate": 6.444444444444446e-05, "loss": 0.6665, "step": 378 }, { "epoch": 0.19406041986687148, "grad_norm": 0.5599377155303955, "learning_rate": 6.461538461538462e-05, "loss": 0.6842, "step": 379 }, { "epoch": 0.19457245263696876, "grad_norm": 0.5299193859100342, "learning_rate": 6.478632478632479e-05, "loss": 0.6891, "step": 380 }, { "epoch": 0.19508448540706605, "grad_norm": 0.589796781539917, "learning_rate": 6.495726495726496e-05, "loss": 0.6463, "step": 381 }, { "epoch": 0.19559651817716334, "grad_norm": 0.5797624588012695, "learning_rate": 6.512820512820513e-05, "loss": 0.6538, "step": 382 }, { "epoch": 0.19610855094726062, "grad_norm": 0.3951943814754486, "learning_rate": 6.52991452991453e-05, "loss": 0.663, "step": 383 }, { "epoch": 0.1966205837173579, "grad_norm": 0.44175034761428833, "learning_rate": 6.547008547008547e-05, "loss": 0.6726, "step": 384 }, { "epoch": 0.1971326164874552, "grad_norm": 0.5440134406089783, "learning_rate": 6.564102564102564e-05, "loss": 0.6657, "step": 385 }, { "epoch": 0.19764464925755248, "grad_norm": 0.5367993116378784, "learning_rate": 6.581196581196581e-05, "loss": 0.6526, "step": 386 }, { "epoch": 0.19815668202764977, "grad_norm": 0.5642850399017334, "learning_rate": 6.598290598290599e-05, "loss": 0.6555, "step": 387 }, { "epoch": 0.19866871479774706, "grad_norm": 0.3553987145423889, "learning_rate": 6.615384615384616e-05, "loss": 0.66, "step": 388 }, { "epoch": 0.19918074756784435, "grad_norm": 0.36254915595054626, "learning_rate": 6.632478632478633e-05, "loss": 0.6889, "step": 389 }, { "epoch": 0.19969278033794163, "grad_norm": 0.45095017552375793, "learning_rate": 6.64957264957265e-05, "loss": 0.6725, "step": 390 }, { "epoch": 0.20020481310803892, "grad_norm": 0.7628677487373352, "learning_rate": 6.666666666666667e-05, "loss": 0.6672, "step": 391 }, { "epoch": 0.2007168458781362, "grad_norm": 1.0749173164367676, "learning_rate": 6.683760683760684e-05, "loss": 0.6844, "step": 392 }, { "epoch": 0.2012288786482335, "grad_norm": 0.9824026226997375, "learning_rate": 6.700854700854701e-05, "loss": 0.7006, "step": 393 }, { "epoch": 0.20174091141833078, "grad_norm": 0.6182637214660645, "learning_rate": 6.717948717948718e-05, "loss": 0.6652, "step": 394 }, { "epoch": 0.20225294418842807, "grad_norm": 0.5594872236251831, "learning_rate": 6.735042735042735e-05, "loss": 0.6725, "step": 395 }, { "epoch": 0.20276497695852536, "grad_norm": 0.5852798223495483, "learning_rate": 6.752136752136753e-05, "loss": 0.6988, "step": 396 }, { "epoch": 0.20327700972862264, "grad_norm": 0.798346996307373, "learning_rate": 6.76923076923077e-05, "loss": 0.7065, "step": 397 }, { "epoch": 0.20378904249871993, "grad_norm": 0.8025190830230713, "learning_rate": 6.786324786324787e-05, "loss": 0.6989, "step": 398 }, { "epoch": 0.20430107526881722, "grad_norm": 1.0600751638412476, "learning_rate": 6.803418803418804e-05, "loss": 0.692, "step": 399 }, { "epoch": 0.2048131080389145, "grad_norm": 0.6275109648704529, "learning_rate": 6.820512820512821e-05, "loss": 0.6751, "step": 400 }, { "epoch": 0.20532514080901176, "grad_norm": 0.7343604564666748, "learning_rate": 6.837606837606838e-05, "loss": 0.6693, "step": 401 }, { "epoch": 0.20583717357910905, "grad_norm": 0.9103212356567383, "learning_rate": 6.854700854700855e-05, "loss": 0.6652, "step": 402 }, { "epoch": 0.20634920634920634, "grad_norm": 0.617779552936554, "learning_rate": 6.871794871794872e-05, "loss": 0.6537, "step": 403 }, { "epoch": 0.20686123911930362, "grad_norm": 0.7276797294616699, "learning_rate": 6.88888888888889e-05, "loss": 0.7035, "step": 404 }, { "epoch": 0.2073732718894009, "grad_norm": 0.7717972993850708, "learning_rate": 6.905982905982907e-05, "loss": 0.6909, "step": 405 }, { "epoch": 0.2078853046594982, "grad_norm": 0.5293760895729065, "learning_rate": 6.923076923076924e-05, "loss": 0.6683, "step": 406 }, { "epoch": 0.2083973374295955, "grad_norm": 0.4664803445339203, "learning_rate": 6.940170940170941e-05, "loss": 0.6728, "step": 407 }, { "epoch": 0.20890937019969277, "grad_norm": 0.46248871088027954, "learning_rate": 6.957264957264958e-05, "loss": 0.6821, "step": 408 }, { "epoch": 0.20942140296979006, "grad_norm": 0.6168457865715027, "learning_rate": 6.974358974358974e-05, "loss": 0.6502, "step": 409 }, { "epoch": 0.20993343573988735, "grad_norm": 0.679824709892273, "learning_rate": 6.991452991452991e-05, "loss": 0.6954, "step": 410 }, { "epoch": 0.21044546850998463, "grad_norm": 0.6625686287879944, "learning_rate": 7.008547008547008e-05, "loss": 0.665, "step": 411 }, { "epoch": 0.21095750128008192, "grad_norm": 0.5553739070892334, "learning_rate": 7.025641025641025e-05, "loss": 0.6832, "step": 412 }, { "epoch": 0.2114695340501792, "grad_norm": 0.6182240843772888, "learning_rate": 7.042735042735044e-05, "loss": 0.699, "step": 413 }, { "epoch": 0.2119815668202765, "grad_norm": 0.4072546362876892, "learning_rate": 7.05982905982906e-05, "loss": 0.6685, "step": 414 }, { "epoch": 0.21249359959037378, "grad_norm": 0.6117467880249023, "learning_rate": 7.076923076923078e-05, "loss": 0.6455, "step": 415 }, { "epoch": 0.21300563236047107, "grad_norm": 0.5120782256126404, "learning_rate": 7.094017094017095e-05, "loss": 0.6971, "step": 416 }, { "epoch": 0.21351766513056836, "grad_norm": 0.42435863614082336, "learning_rate": 7.111111111111112e-05, "loss": 0.6927, "step": 417 }, { "epoch": 0.21402969790066564, "grad_norm": 0.5159029960632324, "learning_rate": 7.128205128205129e-05, "loss": 0.6665, "step": 418 }, { "epoch": 0.21454173067076293, "grad_norm": 0.4846355617046356, "learning_rate": 7.145299145299146e-05, "loss": 0.6766, "step": 419 }, { "epoch": 0.21505376344086022, "grad_norm": 0.41063547134399414, "learning_rate": 7.162393162393163e-05, "loss": 0.6774, "step": 420 }, { "epoch": 0.2155657962109575, "grad_norm": 0.4100075662136078, "learning_rate": 7.17948717948718e-05, "loss": 0.6983, "step": 421 }, { "epoch": 0.2160778289810548, "grad_norm": 0.5213816165924072, "learning_rate": 7.196581196581196e-05, "loss": 0.6712, "step": 422 }, { "epoch": 0.21658986175115208, "grad_norm": 0.4849770665168762, "learning_rate": 7.213675213675213e-05, "loss": 0.6613, "step": 423 }, { "epoch": 0.21710189452124937, "grad_norm": 0.5123862028121948, "learning_rate": 7.23076923076923e-05, "loss": 0.7021, "step": 424 }, { "epoch": 0.21761392729134665, "grad_norm": 0.6538012027740479, "learning_rate": 7.247863247863248e-05, "loss": 0.7158, "step": 425 }, { "epoch": 0.21812596006144394, "grad_norm": 0.7184160947799683, "learning_rate": 7.264957264957265e-05, "loss": 0.6705, "step": 426 }, { "epoch": 0.21863799283154123, "grad_norm": 0.552477240562439, "learning_rate": 7.282051282051282e-05, "loss": 0.6799, "step": 427 }, { "epoch": 0.21915002560163852, "grad_norm": 0.40632468461990356, "learning_rate": 7.299145299145299e-05, "loss": 0.6754, "step": 428 }, { "epoch": 0.2196620583717358, "grad_norm": 0.60041344165802, "learning_rate": 7.316239316239316e-05, "loss": 0.6761, "step": 429 }, { "epoch": 0.2201740911418331, "grad_norm": 0.6998701095581055, "learning_rate": 7.333333333333333e-05, "loss": 0.652, "step": 430 }, { "epoch": 0.22068612391193038, "grad_norm": 0.6453064680099487, "learning_rate": 7.350427350427352e-05, "loss": 0.6535, "step": 431 }, { "epoch": 0.22119815668202766, "grad_norm": 0.5347617268562317, "learning_rate": 7.367521367521369e-05, "loss": 0.7023, "step": 432 }, { "epoch": 0.22171018945212492, "grad_norm": 0.6610273122787476, "learning_rate": 7.384615384615386e-05, "loss": 0.686, "step": 433 }, { "epoch": 0.2222222222222222, "grad_norm": 0.5576261878013611, "learning_rate": 7.401709401709403e-05, "loss": 0.6686, "step": 434 }, { "epoch": 0.2227342549923195, "grad_norm": 0.861988365650177, "learning_rate": 7.41880341880342e-05, "loss": 0.6739, "step": 435 }, { "epoch": 0.22324628776241678, "grad_norm": 0.858404278755188, "learning_rate": 7.435897435897436e-05, "loss": 0.6937, "step": 436 }, { "epoch": 0.22375832053251407, "grad_norm": 0.7285115122795105, "learning_rate": 7.452991452991453e-05, "loss": 0.6679, "step": 437 }, { "epoch": 0.22427035330261136, "grad_norm": 0.6099557876586914, "learning_rate": 7.47008547008547e-05, "loss": 0.6666, "step": 438 }, { "epoch": 0.22478238607270865, "grad_norm": 0.5294449925422668, "learning_rate": 7.487179487179487e-05, "loss": 0.6586, "step": 439 }, { "epoch": 0.22529441884280593, "grad_norm": 0.609220027923584, "learning_rate": 7.504273504273504e-05, "loss": 0.6512, "step": 440 }, { "epoch": 0.22580645161290322, "grad_norm": 0.5738632082939148, "learning_rate": 7.521367521367521e-05, "loss": 0.6779, "step": 441 }, { "epoch": 0.2263184843830005, "grad_norm": 0.5253649353981018, "learning_rate": 7.538461538461539e-05, "loss": 0.6935, "step": 442 }, { "epoch": 0.2268305171530978, "grad_norm": 0.4972066283226013, "learning_rate": 7.555555555555556e-05, "loss": 0.6727, "step": 443 }, { "epoch": 0.22734254992319508, "grad_norm": 0.47770798206329346, "learning_rate": 7.572649572649573e-05, "loss": 0.6737, "step": 444 }, { "epoch": 0.22785458269329237, "grad_norm": 0.5255363583564758, "learning_rate": 7.58974358974359e-05, "loss": 0.69, "step": 445 }, { "epoch": 0.22836661546338966, "grad_norm": 0.6584864854812622, "learning_rate": 7.606837606837607e-05, "loss": 0.6775, "step": 446 }, { "epoch": 0.22887864823348694, "grad_norm": 0.5667986273765564, "learning_rate": 7.623931623931624e-05, "loss": 0.679, "step": 447 }, { "epoch": 0.22939068100358423, "grad_norm": 0.45180562138557434, "learning_rate": 7.641025641025641e-05, "loss": 0.6799, "step": 448 }, { "epoch": 0.22990271377368152, "grad_norm": 0.6342997550964355, "learning_rate": 7.65811965811966e-05, "loss": 0.6971, "step": 449 }, { "epoch": 0.2304147465437788, "grad_norm": 0.5295264720916748, "learning_rate": 7.675213675213675e-05, "loss": 0.6751, "step": 450 }, { "epoch": 0.2309267793138761, "grad_norm": 0.7089836597442627, "learning_rate": 7.692307692307693e-05, "loss": 0.6848, "step": 451 }, { "epoch": 0.23143881208397338, "grad_norm": 0.8291141390800476, "learning_rate": 7.70940170940171e-05, "loss": 0.6596, "step": 452 }, { "epoch": 0.23195084485407066, "grad_norm": 1.1292935609817505, "learning_rate": 7.726495726495727e-05, "loss": 0.6731, "step": 453 }, { "epoch": 0.23246287762416795, "grad_norm": 0.7055255174636841, "learning_rate": 7.743589743589744e-05, "loss": 0.6938, "step": 454 }, { "epoch": 0.23297491039426524, "grad_norm": 0.6519927382469177, "learning_rate": 7.760683760683761e-05, "loss": 0.708, "step": 455 }, { "epoch": 0.23348694316436253, "grad_norm": 0.7521433234214783, "learning_rate": 7.777777777777778e-05, "loss": 0.7017, "step": 456 }, { "epoch": 0.2339989759344598, "grad_norm": 0.5813261270523071, "learning_rate": 7.794871794871795e-05, "loss": 0.6778, "step": 457 }, { "epoch": 0.2345110087045571, "grad_norm": 0.6564371585845947, "learning_rate": 7.811965811965812e-05, "loss": 0.6765, "step": 458 }, { "epoch": 0.2350230414746544, "grad_norm": 0.6506094336509705, "learning_rate": 7.82905982905983e-05, "loss": 0.688, "step": 459 }, { "epoch": 0.23553507424475167, "grad_norm": 0.623616099357605, "learning_rate": 7.846153846153847e-05, "loss": 0.6908, "step": 460 }, { "epoch": 0.23604710701484896, "grad_norm": 0.5556738376617432, "learning_rate": 7.863247863247864e-05, "loss": 0.6923, "step": 461 }, { "epoch": 0.23655913978494625, "grad_norm": 0.5045777559280396, "learning_rate": 7.880341880341881e-05, "loss": 0.6525, "step": 462 }, { "epoch": 0.23707117255504354, "grad_norm": 0.4828222990036011, "learning_rate": 7.897435897435898e-05, "loss": 0.6646, "step": 463 }, { "epoch": 0.23758320532514082, "grad_norm": 0.4571791887283325, "learning_rate": 7.914529914529915e-05, "loss": 0.6618, "step": 464 }, { "epoch": 0.23809523809523808, "grad_norm": 0.43925872445106506, "learning_rate": 7.931623931623932e-05, "loss": 0.6587, "step": 465 }, { "epoch": 0.23860727086533537, "grad_norm": 0.5369096994400024, "learning_rate": 7.948717948717948e-05, "loss": 0.6814, "step": 466 }, { "epoch": 0.23911930363543266, "grad_norm": 0.5117824673652649, "learning_rate": 7.965811965811965e-05, "loss": 0.6678, "step": 467 }, { "epoch": 0.23963133640552994, "grad_norm": 0.3860718905925751, "learning_rate": 7.982905982905984e-05, "loss": 0.6834, "step": 468 }, { "epoch": 0.24014336917562723, "grad_norm": 0.4532128572463989, "learning_rate": 8e-05, "loss": 0.6954, "step": 469 }, { "epoch": 0.24065540194572452, "grad_norm": 0.45127370953559875, "learning_rate": 8.017094017094018e-05, "loss": 0.6575, "step": 470 }, { "epoch": 0.2411674347158218, "grad_norm": 0.5474525094032288, "learning_rate": 8.034188034188035e-05, "loss": 0.6754, "step": 471 }, { "epoch": 0.2416794674859191, "grad_norm": 0.6400313377380371, "learning_rate": 8.051282051282052e-05, "loss": 0.6578, "step": 472 }, { "epoch": 0.24219150025601638, "grad_norm": 0.7089480757713318, "learning_rate": 8.068376068376069e-05, "loss": 0.6638, "step": 473 }, { "epoch": 0.24270353302611367, "grad_norm": 0.6121129393577576, "learning_rate": 8.085470085470086e-05, "loss": 0.6878, "step": 474 }, { "epoch": 0.24321556579621095, "grad_norm": 0.6018154621124268, "learning_rate": 8.102564102564103e-05, "loss": 0.675, "step": 475 }, { "epoch": 0.24372759856630824, "grad_norm": 0.818397581577301, "learning_rate": 8.11965811965812e-05, "loss": 0.6782, "step": 476 }, { "epoch": 0.24423963133640553, "grad_norm": 1.017034649848938, "learning_rate": 8.136752136752138e-05, "loss": 0.6873, "step": 477 }, { "epoch": 0.24475166410650281, "grad_norm": 1.0873260498046875, "learning_rate": 8.153846153846155e-05, "loss": 0.7151, "step": 478 }, { "epoch": 0.2452636968766001, "grad_norm": 0.7078177332878113, "learning_rate": 8.170940170940172e-05, "loss": 0.6787, "step": 479 }, { "epoch": 0.2457757296466974, "grad_norm": 0.4512692093849182, "learning_rate": 8.188034188034188e-05, "loss": 0.6517, "step": 480 }, { "epoch": 0.24628776241679468, "grad_norm": 0.5373954772949219, "learning_rate": 8.205128205128205e-05, "loss": 0.6885, "step": 481 }, { "epoch": 0.24679979518689196, "grad_norm": 0.41701793670654297, "learning_rate": 8.222222222222222e-05, "loss": 0.6882, "step": 482 }, { "epoch": 0.24731182795698925, "grad_norm": 0.6875032782554626, "learning_rate": 8.239316239316239e-05, "loss": 0.6611, "step": 483 }, { "epoch": 0.24782386072708654, "grad_norm": 0.7074156403541565, "learning_rate": 8.256410256410256e-05, "loss": 0.681, "step": 484 }, { "epoch": 0.24833589349718382, "grad_norm": 0.8743919730186462, "learning_rate": 8.273504273504273e-05, "loss": 0.7009, "step": 485 }, { "epoch": 0.2488479262672811, "grad_norm": 1.2638086080551147, "learning_rate": 8.290598290598292e-05, "loss": 0.6922, "step": 486 }, { "epoch": 0.2493599590373784, "grad_norm": 0.6606730818748474, "learning_rate": 8.307692307692309e-05, "loss": 0.6887, "step": 487 }, { "epoch": 0.24987199180747569, "grad_norm": 0.491021990776062, "learning_rate": 8.324786324786326e-05, "loss": 0.6764, "step": 488 }, { "epoch": 0.250384024577573, "grad_norm": 0.842737078666687, "learning_rate": 8.341880341880343e-05, "loss": 0.6952, "step": 489 }, { "epoch": 0.25089605734767023, "grad_norm": 0.896513044834137, "learning_rate": 8.35897435897436e-05, "loss": 0.688, "step": 490 }, { "epoch": 0.25140809011776755, "grad_norm": 0.7350515723228455, "learning_rate": 8.376068376068377e-05, "loss": 0.6775, "step": 491 }, { "epoch": 0.2519201228878648, "grad_norm": 0.6392162442207336, "learning_rate": 8.393162393162394e-05, "loss": 0.6681, "step": 492 }, { "epoch": 0.2524321556579621, "grad_norm": 0.5157011151313782, "learning_rate": 8.410256410256411e-05, "loss": 0.667, "step": 493 }, { "epoch": 0.2529441884280594, "grad_norm": 0.7303450107574463, "learning_rate": 8.427350427350427e-05, "loss": 0.6526, "step": 494 }, { "epoch": 0.2534562211981567, "grad_norm": 0.8630701899528503, "learning_rate": 8.444444444444444e-05, "loss": 0.6695, "step": 495 }, { "epoch": 0.25396825396825395, "grad_norm": 0.6708585023880005, "learning_rate": 8.461538461538461e-05, "loss": 0.6417, "step": 496 }, { "epoch": 0.25448028673835127, "grad_norm": 0.5876595973968506, "learning_rate": 8.478632478632479e-05, "loss": 0.6616, "step": 497 }, { "epoch": 0.25499231950844853, "grad_norm": 0.49093326926231384, "learning_rate": 8.495726495726496e-05, "loss": 0.6506, "step": 498 }, { "epoch": 0.25550435227854584, "grad_norm": 0.6547732353210449, "learning_rate": 8.512820512820513e-05, "loss": 0.7048, "step": 499 }, { "epoch": 0.2560163850486431, "grad_norm": 0.9222532510757446, "learning_rate": 8.52991452991453e-05, "loss": 0.6717, "step": 500 }, { "epoch": 0.2565284178187404, "grad_norm": 0.8461019992828369, "learning_rate": 8.547008547008547e-05, "loss": 0.6771, "step": 501 }, { "epoch": 0.2570404505888377, "grad_norm": 0.622621476650238, "learning_rate": 8.564102564102564e-05, "loss": 0.6815, "step": 502 }, { "epoch": 0.257552483358935, "grad_norm": 0.44409164786338806, "learning_rate": 8.581196581196581e-05, "loss": 0.6814, "step": 503 }, { "epoch": 0.25806451612903225, "grad_norm": 0.5412179827690125, "learning_rate": 8.5982905982906e-05, "loss": 0.6511, "step": 504 }, { "epoch": 0.25857654889912957, "grad_norm": 0.6486039757728577, "learning_rate": 8.615384615384617e-05, "loss": 0.6865, "step": 505 }, { "epoch": 0.2590885816692268, "grad_norm": 0.6729030013084412, "learning_rate": 8.632478632478634e-05, "loss": 0.6736, "step": 506 }, { "epoch": 0.25960061443932414, "grad_norm": 0.6669950485229492, "learning_rate": 8.64957264957265e-05, "loss": 0.6878, "step": 507 }, { "epoch": 0.2601126472094214, "grad_norm": 0.60361248254776, "learning_rate": 8.666666666666667e-05, "loss": 0.6954, "step": 508 }, { "epoch": 0.2606246799795187, "grad_norm": 0.44707047939300537, "learning_rate": 8.683760683760684e-05, "loss": 0.665, "step": 509 }, { "epoch": 0.261136712749616, "grad_norm": 0.4156092703342438, "learning_rate": 8.700854700854701e-05, "loss": 0.6606, "step": 510 }, { "epoch": 0.2616487455197133, "grad_norm": 0.577501654624939, "learning_rate": 8.717948717948718e-05, "loss": 0.6512, "step": 511 }, { "epoch": 0.26216077828981055, "grad_norm": 0.7895998358726501, "learning_rate": 8.735042735042735e-05, "loss": 0.6407, "step": 512 }, { "epoch": 0.2626728110599078, "grad_norm": 0.786401093006134, "learning_rate": 8.752136752136752e-05, "loss": 0.6784, "step": 513 }, { "epoch": 0.2631848438300051, "grad_norm": 0.8058803677558899, "learning_rate": 8.76923076923077e-05, "loss": 0.6514, "step": 514 }, { "epoch": 0.2636968766001024, "grad_norm": 0.8664082884788513, "learning_rate": 8.786324786324787e-05, "loss": 0.6712, "step": 515 }, { "epoch": 0.2642089093701997, "grad_norm": 0.7325778007507324, "learning_rate": 8.803418803418804e-05, "loss": 0.6643, "step": 516 }, { "epoch": 0.26472094214029696, "grad_norm": 0.5215964317321777, "learning_rate": 8.820512820512821e-05, "loss": 0.6548, "step": 517 }, { "epoch": 0.26523297491039427, "grad_norm": 0.6621806025505066, "learning_rate": 8.837606837606838e-05, "loss": 0.6571, "step": 518 }, { "epoch": 0.26574500768049153, "grad_norm": 0.8177564144134521, "learning_rate": 8.854700854700855e-05, "loss": 0.6761, "step": 519 }, { "epoch": 0.26625704045058884, "grad_norm": 0.7923840880393982, "learning_rate": 8.871794871794872e-05, "loss": 0.6743, "step": 520 }, { "epoch": 0.2667690732206861, "grad_norm": 0.6678850054740906, "learning_rate": 8.888888888888889e-05, "loss": 0.6796, "step": 521 }, { "epoch": 0.2672811059907834, "grad_norm": 0.631187915802002, "learning_rate": 8.905982905982906e-05, "loss": 0.6683, "step": 522 }, { "epoch": 0.2677931387608807, "grad_norm": 0.6493726968765259, "learning_rate": 8.923076923076924e-05, "loss": 0.6674, "step": 523 }, { "epoch": 0.268305171530978, "grad_norm": 0.8056166172027588, "learning_rate": 8.94017094017094e-05, "loss": 0.6512, "step": 524 }, { "epoch": 0.26881720430107525, "grad_norm": 0.638189435005188, "learning_rate": 8.957264957264958e-05, "loss": 0.6953, "step": 525 }, { "epoch": 0.26932923707117257, "grad_norm": 0.5925140976905823, "learning_rate": 8.974358974358975e-05, "loss": 0.6717, "step": 526 }, { "epoch": 0.2698412698412698, "grad_norm": 0.5173265933990479, "learning_rate": 8.991452991452992e-05, "loss": 0.6828, "step": 527 }, { "epoch": 0.27035330261136714, "grad_norm": 0.5081897377967834, "learning_rate": 9.008547008547009e-05, "loss": 0.6657, "step": 528 }, { "epoch": 0.2708653353814644, "grad_norm": 0.7393469214439392, "learning_rate": 9.025641025641026e-05, "loss": 0.7067, "step": 529 }, { "epoch": 0.2713773681515617, "grad_norm": 0.7789038419723511, "learning_rate": 9.042735042735043e-05, "loss": 0.6337, "step": 530 }, { "epoch": 0.271889400921659, "grad_norm": 0.7243101596832275, "learning_rate": 9.05982905982906e-05, "loss": 0.6648, "step": 531 }, { "epoch": 0.2724014336917563, "grad_norm": 0.47588682174682617, "learning_rate": 9.076923076923078e-05, "loss": 0.6633, "step": 532 }, { "epoch": 0.27291346646185355, "grad_norm": 0.5964364409446716, "learning_rate": 9.094017094017095e-05, "loss": 0.6879, "step": 533 }, { "epoch": 0.27342549923195086, "grad_norm": 0.9859206080436707, "learning_rate": 9.111111111111112e-05, "loss": 0.6728, "step": 534 }, { "epoch": 0.2739375320020481, "grad_norm": 1.4211821556091309, "learning_rate": 9.128205128205129e-05, "loss": 0.669, "step": 535 }, { "epoch": 0.27444956477214544, "grad_norm": 0.6588103175163269, "learning_rate": 9.145299145299146e-05, "loss": 0.6503, "step": 536 }, { "epoch": 0.2749615975422427, "grad_norm": 0.814889669418335, "learning_rate": 9.162393162393162e-05, "loss": 0.6718, "step": 537 }, { "epoch": 0.27547363031234, "grad_norm": 1.1814385652542114, "learning_rate": 9.179487179487179e-05, "loss": 0.6748, "step": 538 }, { "epoch": 0.27598566308243727, "grad_norm": 0.7341762781143188, "learning_rate": 9.196581196581196e-05, "loss": 0.6793, "step": 539 }, { "epoch": 0.2764976958525346, "grad_norm": 0.8702532649040222, "learning_rate": 9.213675213675213e-05, "loss": 0.6645, "step": 540 }, { "epoch": 0.27700972862263185, "grad_norm": 1.0905200242996216, "learning_rate": 9.230769230769232e-05, "loss": 0.6708, "step": 541 }, { "epoch": 0.27752176139272916, "grad_norm": 0.9810071587562561, "learning_rate": 9.247863247863249e-05, "loss": 0.6772, "step": 542 }, { "epoch": 0.2780337941628264, "grad_norm": 0.7395370602607727, "learning_rate": 9.264957264957266e-05, "loss": 0.6944, "step": 543 }, { "epoch": 0.27854582693292373, "grad_norm": 1.014681100845337, "learning_rate": 9.282051282051283e-05, "loss": 0.6643, "step": 544 }, { "epoch": 0.279057859703021, "grad_norm": 1.0808416604995728, "learning_rate": 9.2991452991453e-05, "loss": 0.6617, "step": 545 }, { "epoch": 0.27956989247311825, "grad_norm": 1.4037442207336426, "learning_rate": 9.316239316239317e-05, "loss": 0.6839, "step": 546 }, { "epoch": 0.28008192524321557, "grad_norm": 0.7871617078781128, "learning_rate": 9.333333333333334e-05, "loss": 0.6808, "step": 547 }, { "epoch": 0.2805939580133128, "grad_norm": 0.9698381423950195, "learning_rate": 9.350427350427351e-05, "loss": 0.6738, "step": 548 }, { "epoch": 0.28110599078341014, "grad_norm": 0.8822514414787292, "learning_rate": 9.367521367521369e-05, "loss": 0.6713, "step": 549 }, { "epoch": 0.2816180235535074, "grad_norm": 0.8566637635231018, "learning_rate": 9.384615384615386e-05, "loss": 0.6757, "step": 550 }, { "epoch": 0.2821300563236047, "grad_norm": 1.0609115362167358, "learning_rate": 9.401709401709401e-05, "loss": 0.7002, "step": 551 }, { "epoch": 0.282642089093702, "grad_norm": 1.1283247470855713, "learning_rate": 9.418803418803419e-05, "loss": 0.6637, "step": 552 }, { "epoch": 0.2831541218637993, "grad_norm": 0.683198094367981, "learning_rate": 9.435897435897436e-05, "loss": 0.6554, "step": 553 }, { "epoch": 0.28366615463389655, "grad_norm": 0.9595058560371399, "learning_rate": 9.452991452991453e-05, "loss": 0.6582, "step": 554 }, { "epoch": 0.28417818740399386, "grad_norm": 1.520620584487915, "learning_rate": 9.47008547008547e-05, "loss": 0.6732, "step": 555 }, { "epoch": 0.2846902201740911, "grad_norm": 0.6571989059448242, "learning_rate": 9.487179487179487e-05, "loss": 0.6794, "step": 556 }, { "epoch": 0.28520225294418844, "grad_norm": 1.5178425312042236, "learning_rate": 9.504273504273504e-05, "loss": 0.6791, "step": 557 }, { "epoch": 0.2857142857142857, "grad_norm": 0.5356901288032532, "learning_rate": 9.521367521367521e-05, "loss": 0.6699, "step": 558 }, { "epoch": 0.286226318484383, "grad_norm": 1.403454065322876, "learning_rate": 9.53846153846154e-05, "loss": 0.6779, "step": 559 }, { "epoch": 0.2867383512544803, "grad_norm": 0.9836353063583374, "learning_rate": 9.555555555555557e-05, "loss": 0.6685, "step": 560 }, { "epoch": 0.2872503840245776, "grad_norm": 0.7977702021598816, "learning_rate": 9.572649572649574e-05, "loss": 0.6619, "step": 561 }, { "epoch": 0.28776241679467485, "grad_norm": 0.6732373833656311, "learning_rate": 9.589743589743591e-05, "loss": 0.6661, "step": 562 }, { "epoch": 0.28827444956477216, "grad_norm": 0.8449483513832092, "learning_rate": 9.606837606837608e-05, "loss": 0.6637, "step": 563 }, { "epoch": 0.2887864823348694, "grad_norm": 0.7036803960800171, "learning_rate": 9.623931623931625e-05, "loss": 0.6569, "step": 564 }, { "epoch": 0.28929851510496674, "grad_norm": 0.5433368682861328, "learning_rate": 9.641025641025641e-05, "loss": 0.6446, "step": 565 }, { "epoch": 0.289810547875064, "grad_norm": 0.5868086814880371, "learning_rate": 9.658119658119658e-05, "loss": 0.6833, "step": 566 }, { "epoch": 0.2903225806451613, "grad_norm": 0.6796988248825073, "learning_rate": 9.675213675213675e-05, "loss": 0.6771, "step": 567 }, { "epoch": 0.29083461341525857, "grad_norm": 0.5650494694709778, "learning_rate": 9.692307692307692e-05, "loss": 0.6665, "step": 568 }, { "epoch": 0.2913466461853559, "grad_norm": 0.6429908871650696, "learning_rate": 9.70940170940171e-05, "loss": 0.6493, "step": 569 }, { "epoch": 0.29185867895545314, "grad_norm": 0.6639748811721802, "learning_rate": 9.726495726495727e-05, "loss": 0.6869, "step": 570 }, { "epoch": 0.29237071172555046, "grad_norm": 0.8330392241477966, "learning_rate": 9.743589743589744e-05, "loss": 0.637, "step": 571 }, { "epoch": 0.2928827444956477, "grad_norm": 1.0730644464492798, "learning_rate": 9.760683760683761e-05, "loss": 0.669, "step": 572 }, { "epoch": 0.29339477726574503, "grad_norm": 0.9497105479240417, "learning_rate": 9.777777777777778e-05, "loss": 0.6574, "step": 573 }, { "epoch": 0.2939068100358423, "grad_norm": 0.6428067088127136, "learning_rate": 9.794871794871795e-05, "loss": 0.6558, "step": 574 }, { "epoch": 0.2944188428059396, "grad_norm": 0.6592909693717957, "learning_rate": 9.811965811965812e-05, "loss": 0.6486, "step": 575 }, { "epoch": 0.29493087557603687, "grad_norm": 0.7068434953689575, "learning_rate": 9.829059829059829e-05, "loss": 0.6682, "step": 576 }, { "epoch": 0.2954429083461341, "grad_norm": 0.7427095174789429, "learning_rate": 9.846153846153848e-05, "loss": 0.674, "step": 577 }, { "epoch": 0.29595494111623144, "grad_norm": 0.5885223150253296, "learning_rate": 9.863247863247864e-05, "loss": 0.6714, "step": 578 }, { "epoch": 0.2964669738863287, "grad_norm": 0.7597149014472961, "learning_rate": 9.88034188034188e-05, "loss": 0.6318, "step": 579 }, { "epoch": 0.296979006656426, "grad_norm": 1.1249017715454102, "learning_rate": 9.897435897435898e-05, "loss": 0.6794, "step": 580 }, { "epoch": 0.2974910394265233, "grad_norm": 1.0035897493362427, "learning_rate": 9.914529914529915e-05, "loss": 0.6495, "step": 581 }, { "epoch": 0.2980030721966206, "grad_norm": 0.912932813167572, "learning_rate": 9.931623931623932e-05, "loss": 0.6567, "step": 582 }, { "epoch": 0.29851510496671785, "grad_norm": 0.7125023007392883, "learning_rate": 9.948717948717949e-05, "loss": 0.6643, "step": 583 }, { "epoch": 0.29902713773681516, "grad_norm": 0.9387484788894653, "learning_rate": 9.965811965811966e-05, "loss": 0.6647, "step": 584 }, { "epoch": 0.2995391705069124, "grad_norm": 1.0707823038101196, "learning_rate": 9.982905982905983e-05, "loss": 0.6289, "step": 585 }, { "epoch": 0.30005120327700974, "grad_norm": 0.8889691829681396, "learning_rate": 0.0001, "loss": 0.679, "step": 586 }, { "epoch": 0.300563236047107, "grad_norm": 1.1190286874771118, "learning_rate": 9.999999112926735e-05, "loss": 0.6679, "step": 587 }, { "epoch": 0.3010752688172043, "grad_norm": 0.9434065818786621, "learning_rate": 9.999996451707256e-05, "loss": 0.6574, "step": 588 }, { "epoch": 0.30158730158730157, "grad_norm": 1.2116340398788452, "learning_rate": 9.999992016342507e-05, "loss": 0.667, "step": 589 }, { "epoch": 0.3020993343573989, "grad_norm": 1.084443211555481, "learning_rate": 9.999985806834063e-05, "loss": 0.6692, "step": 590 }, { "epoch": 0.30261136712749614, "grad_norm": 1.0354690551757812, "learning_rate": 9.999977823184124e-05, "loss": 0.6772, "step": 591 }, { "epoch": 0.30312339989759346, "grad_norm": 0.8059335350990295, "learning_rate": 9.999968065395524e-05, "loss": 0.6508, "step": 592 }, { "epoch": 0.3036354326676907, "grad_norm": 0.7744336128234863, "learning_rate": 9.999956533471726e-05, "loss": 0.6308, "step": 593 }, { "epoch": 0.30414746543778803, "grad_norm": 0.8598417043685913, "learning_rate": 9.999943227416823e-05, "loss": 0.6791, "step": 594 }, { "epoch": 0.3046594982078853, "grad_norm": 1.2314633131027222, "learning_rate": 9.999928147235535e-05, "loss": 0.6631, "step": 595 }, { "epoch": 0.3051715309779826, "grad_norm": 0.9217356443405151, "learning_rate": 9.999911292933215e-05, "loss": 0.663, "step": 596 }, { "epoch": 0.30568356374807987, "grad_norm": 0.8928699493408203, "learning_rate": 9.999892664515839e-05, "loss": 0.6724, "step": 597 }, { "epoch": 0.3061955965181772, "grad_norm": 0.8188864588737488, "learning_rate": 9.999872261990021e-05, "loss": 0.6331, "step": 598 }, { "epoch": 0.30670762928827444, "grad_norm": 1.022771954536438, "learning_rate": 9.999850085362999e-05, "loss": 0.7, "step": 599 }, { "epoch": 0.30721966205837176, "grad_norm": 1.2225421667099, "learning_rate": 9.999826134642641e-05, "loss": 0.6845, "step": 600 }, { "epoch": 0.307731694828469, "grad_norm": 0.6932691335678101, "learning_rate": 9.999800409837446e-05, "loss": 0.6895, "step": 601 }, { "epoch": 0.30824372759856633, "grad_norm": 0.9491382837295532, "learning_rate": 9.999772910956545e-05, "loss": 0.6486, "step": 602 }, { "epoch": 0.3087557603686636, "grad_norm": 0.7604853510856628, "learning_rate": 9.999743638009689e-05, "loss": 0.6407, "step": 603 }, { "epoch": 0.3092677931387609, "grad_norm": 0.7431574463844299, "learning_rate": 9.99971259100727e-05, "loss": 0.6582, "step": 604 }, { "epoch": 0.30977982590885816, "grad_norm": 1.0924266576766968, "learning_rate": 9.999679769960302e-05, "loss": 0.6977, "step": 605 }, { "epoch": 0.3102918586789555, "grad_norm": 0.8012250065803528, "learning_rate": 9.999645174880434e-05, "loss": 0.6558, "step": 606 }, { "epoch": 0.31080389144905274, "grad_norm": 0.7855070233345032, "learning_rate": 9.999608805779937e-05, "loss": 0.6673, "step": 607 }, { "epoch": 0.31131592421915005, "grad_norm": 0.820652961730957, "learning_rate": 9.999570662671718e-05, "loss": 0.6737, "step": 608 }, { "epoch": 0.3118279569892473, "grad_norm": 0.7835376262664795, "learning_rate": 9.999530745569313e-05, "loss": 0.6534, "step": 609 }, { "epoch": 0.31233998975934457, "grad_norm": 0.646430492401123, "learning_rate": 9.999489054486881e-05, "loss": 0.6564, "step": 610 }, { "epoch": 0.3128520225294419, "grad_norm": 0.699636697769165, "learning_rate": 9.99944558943922e-05, "loss": 0.6553, "step": 611 }, { "epoch": 0.31336405529953915, "grad_norm": 0.7669063806533813, "learning_rate": 9.999400350441752e-05, "loss": 0.6534, "step": 612 }, { "epoch": 0.31387608806963646, "grad_norm": 0.6899175047874451, "learning_rate": 9.999353337510526e-05, "loss": 0.6591, "step": 613 }, { "epoch": 0.3143881208397337, "grad_norm": 0.5625957250595093, "learning_rate": 9.999304550662228e-05, "loss": 0.6642, "step": 614 }, { "epoch": 0.31490015360983103, "grad_norm": 0.7017382979393005, "learning_rate": 9.999253989914163e-05, "loss": 0.6634, "step": 615 }, { "epoch": 0.3154121863799283, "grad_norm": 0.5705029964447021, "learning_rate": 9.999201655284278e-05, "loss": 0.668, "step": 616 }, { "epoch": 0.3159242191500256, "grad_norm": 0.5717861652374268, "learning_rate": 9.999147546791139e-05, "loss": 0.6457, "step": 617 }, { "epoch": 0.31643625192012287, "grad_norm": 0.715431272983551, "learning_rate": 9.999091664453946e-05, "loss": 0.6782, "step": 618 }, { "epoch": 0.3169482846902202, "grad_norm": 0.6624259352684021, "learning_rate": 9.999034008292528e-05, "loss": 0.65, "step": 619 }, { "epoch": 0.31746031746031744, "grad_norm": 0.7460405230522156, "learning_rate": 9.998974578327343e-05, "loss": 0.6617, "step": 620 }, { "epoch": 0.31797235023041476, "grad_norm": 0.699622392654419, "learning_rate": 9.998913374579477e-05, "loss": 0.659, "step": 621 }, { "epoch": 0.318484383000512, "grad_norm": 0.5766032934188843, "learning_rate": 9.99885039707065e-05, "loss": 0.687, "step": 622 }, { "epoch": 0.31899641577060933, "grad_norm": 0.7408196926116943, "learning_rate": 9.998785645823206e-05, "loss": 0.6523, "step": 623 }, { "epoch": 0.3195084485407066, "grad_norm": 0.5750259757041931, "learning_rate": 9.99871912086012e-05, "loss": 0.6678, "step": 624 }, { "epoch": 0.3200204813108039, "grad_norm": 0.6813869476318359, "learning_rate": 9.998650822205e-05, "loss": 0.6629, "step": 625 }, { "epoch": 0.32053251408090117, "grad_norm": 0.5667946338653564, "learning_rate": 9.998580749882077e-05, "loss": 0.6674, "step": 626 }, { "epoch": 0.3210445468509985, "grad_norm": 0.5587700009346008, "learning_rate": 9.998508903916218e-05, "loss": 0.6483, "step": 627 }, { "epoch": 0.32155657962109574, "grad_norm": 0.6187735199928284, "learning_rate": 9.998435284332912e-05, "loss": 0.6726, "step": 628 }, { "epoch": 0.32206861239119305, "grad_norm": 0.5866286754608154, "learning_rate": 9.998359891158287e-05, "loss": 0.6706, "step": 629 }, { "epoch": 0.3225806451612903, "grad_norm": 0.6426648497581482, "learning_rate": 9.998282724419088e-05, "loss": 0.6428, "step": 630 }, { "epoch": 0.32309267793138763, "grad_norm": 0.560668408870697, "learning_rate": 9.9982037841427e-05, "loss": 0.6502, "step": 631 }, { "epoch": 0.3236047107014849, "grad_norm": 0.4687606394290924, "learning_rate": 9.998123070357135e-05, "loss": 0.6625, "step": 632 }, { "epoch": 0.3241167434715822, "grad_norm": 0.5516302585601807, "learning_rate": 9.99804058309103e-05, "loss": 0.6893, "step": 633 }, { "epoch": 0.32462877624167946, "grad_norm": 0.6200506091117859, "learning_rate": 9.997956322373654e-05, "loss": 0.659, "step": 634 }, { "epoch": 0.3251408090117768, "grad_norm": 0.5519453883171082, "learning_rate": 9.997870288234904e-05, "loss": 0.6496, "step": 635 }, { "epoch": 0.32565284178187404, "grad_norm": 0.6353191137313843, "learning_rate": 9.99778248070531e-05, "loss": 0.6471, "step": 636 }, { "epoch": 0.32616487455197135, "grad_norm": 0.6826664805412292, "learning_rate": 9.997692899816027e-05, "loss": 0.6692, "step": 637 }, { "epoch": 0.3266769073220686, "grad_norm": 0.6646647453308105, "learning_rate": 9.997601545598842e-05, "loss": 0.656, "step": 638 }, { "epoch": 0.3271889400921659, "grad_norm": 0.6459507942199707, "learning_rate": 9.99750841808617e-05, "loss": 0.6451, "step": 639 }, { "epoch": 0.3277009728622632, "grad_norm": 0.6274622082710266, "learning_rate": 9.997413517311055e-05, "loss": 0.6355, "step": 640 }, { "epoch": 0.32821300563236044, "grad_norm": 0.6396896243095398, "learning_rate": 9.997316843307169e-05, "loss": 0.6797, "step": 641 }, { "epoch": 0.32872503840245776, "grad_norm": 0.7291072010993958, "learning_rate": 9.997218396108818e-05, "loss": 0.6461, "step": 642 }, { "epoch": 0.329237071172555, "grad_norm": 1.0755374431610107, "learning_rate": 9.997118175750932e-05, "loss": 0.6481, "step": 643 }, { "epoch": 0.32974910394265233, "grad_norm": 1.1065418720245361, "learning_rate": 9.997016182269074e-05, "loss": 0.6925, "step": 644 }, { "epoch": 0.3302611367127496, "grad_norm": 1.13067626953125, "learning_rate": 9.996912415699431e-05, "loss": 0.6546, "step": 645 }, { "epoch": 0.3307731694828469, "grad_norm": 0.8079965114593506, "learning_rate": 9.996806876078823e-05, "loss": 0.6648, "step": 646 }, { "epoch": 0.33128520225294417, "grad_norm": 0.8003170490264893, "learning_rate": 9.996699563444701e-05, "loss": 0.6345, "step": 647 }, { "epoch": 0.3317972350230415, "grad_norm": 1.225967526435852, "learning_rate": 9.996590477835141e-05, "loss": 0.6643, "step": 648 }, { "epoch": 0.33230926779313874, "grad_norm": 0.8137050271034241, "learning_rate": 9.996479619288852e-05, "loss": 0.671, "step": 649 }, { "epoch": 0.33282130056323606, "grad_norm": 0.753202497959137, "learning_rate": 9.996366987845166e-05, "loss": 0.6991, "step": 650 }, { "epoch": 0.3333333333333333, "grad_norm": 0.5455398559570312, "learning_rate": 9.99625258354405e-05, "loss": 0.6601, "step": 651 }, { "epoch": 0.33384536610343063, "grad_norm": 0.6738185882568359, "learning_rate": 9.996136406426098e-05, "loss": 0.6791, "step": 652 }, { "epoch": 0.3343573988735279, "grad_norm": 0.6913399696350098, "learning_rate": 9.996018456532534e-05, "loss": 0.6709, "step": 653 }, { "epoch": 0.3348694316436252, "grad_norm": 0.8063302636146545, "learning_rate": 9.995898733905208e-05, "loss": 0.6611, "step": 654 }, { "epoch": 0.33538146441372246, "grad_norm": 1.039247989654541, "learning_rate": 9.995777238586603e-05, "loss": 0.6591, "step": 655 }, { "epoch": 0.3358934971838198, "grad_norm": 1.1081150770187378, "learning_rate": 9.995653970619827e-05, "loss": 0.6659, "step": 656 }, { "epoch": 0.33640552995391704, "grad_norm": 0.9715071320533752, "learning_rate": 9.995528930048621e-05, "loss": 0.6692, "step": 657 }, { "epoch": 0.33691756272401435, "grad_norm": 0.7164512872695923, "learning_rate": 9.995402116917352e-05, "loss": 0.6549, "step": 658 }, { "epoch": 0.3374295954941116, "grad_norm": 0.5304288268089294, "learning_rate": 9.995273531271018e-05, "loss": 0.6304, "step": 659 }, { "epoch": 0.3379416282642089, "grad_norm": 0.6173895001411438, "learning_rate": 9.995143173155246e-05, "loss": 0.663, "step": 660 }, { "epoch": 0.3384536610343062, "grad_norm": 0.5473006367683411, "learning_rate": 9.995011042616287e-05, "loss": 0.6921, "step": 661 }, { "epoch": 0.3389656938044035, "grad_norm": 0.593630850315094, "learning_rate": 9.994877139701026e-05, "loss": 0.6638, "step": 662 }, { "epoch": 0.33947772657450076, "grad_norm": 0.6406407952308655, "learning_rate": 9.994741464456979e-05, "loss": 0.6547, "step": 663 }, { "epoch": 0.3399897593445981, "grad_norm": 0.65459805727005, "learning_rate": 9.994604016932285e-05, "loss": 0.6405, "step": 664 }, { "epoch": 0.34050179211469533, "grad_norm": 0.8077269196510315, "learning_rate": 9.994464797175713e-05, "loss": 0.6387, "step": 665 }, { "epoch": 0.34101382488479265, "grad_norm": 0.8140881061553955, "learning_rate": 9.994323805236667e-05, "loss": 0.6442, "step": 666 }, { "epoch": 0.3415258576548899, "grad_norm": 0.6722447872161865, "learning_rate": 9.99418104116517e-05, "loss": 0.643, "step": 667 }, { "epoch": 0.3420378904249872, "grad_norm": 0.6759157180786133, "learning_rate": 9.99403650501188e-05, "loss": 0.6586, "step": 668 }, { "epoch": 0.3425499231950845, "grad_norm": 0.511278510093689, "learning_rate": 9.993890196828084e-05, "loss": 0.6625, "step": 669 }, { "epoch": 0.3430619559651818, "grad_norm": 0.7199527025222778, "learning_rate": 9.993742116665697e-05, "loss": 0.6524, "step": 670 }, { "epoch": 0.34357398873527906, "grad_norm": 0.7448251247406006, "learning_rate": 9.99359226457726e-05, "loss": 0.6846, "step": 671 }, { "epoch": 0.34408602150537637, "grad_norm": 0.7884711623191833, "learning_rate": 9.993440640615948e-05, "loss": 0.6676, "step": 672 }, { "epoch": 0.34459805427547363, "grad_norm": 0.7646994590759277, "learning_rate": 9.993287244835559e-05, "loss": 0.6646, "step": 673 }, { "epoch": 0.3451100870455709, "grad_norm": 0.6594809293746948, "learning_rate": 9.993132077290522e-05, "loss": 0.6488, "step": 674 }, { "epoch": 0.3456221198156682, "grad_norm": 0.5072150230407715, "learning_rate": 9.992975138035896e-05, "loss": 0.673, "step": 675 }, { "epoch": 0.34613415258576546, "grad_norm": 0.8965052366256714, "learning_rate": 9.992816427127368e-05, "loss": 0.6685, "step": 676 }, { "epoch": 0.3466461853558628, "grad_norm": 0.9453760981559753, "learning_rate": 9.992655944621252e-05, "loss": 0.6893, "step": 677 }, { "epoch": 0.34715821812596004, "grad_norm": 0.8601169586181641, "learning_rate": 9.992493690574494e-05, "loss": 0.6715, "step": 678 }, { "epoch": 0.34767025089605735, "grad_norm": 0.9070627689361572, "learning_rate": 9.992329665044666e-05, "loss": 0.6644, "step": 679 }, { "epoch": 0.3481822836661546, "grad_norm": 0.7975482940673828, "learning_rate": 9.992163868089968e-05, "loss": 0.663, "step": 680 }, { "epoch": 0.3486943164362519, "grad_norm": 0.9365615248680115, "learning_rate": 9.991996299769228e-05, "loss": 0.7023, "step": 681 }, { "epoch": 0.3492063492063492, "grad_norm": 1.0341823101043701, "learning_rate": 9.991826960141909e-05, "loss": 0.6447, "step": 682 }, { "epoch": 0.3497183819764465, "grad_norm": 0.8796820640563965, "learning_rate": 9.991655849268092e-05, "loss": 0.6651, "step": 683 }, { "epoch": 0.35023041474654376, "grad_norm": 0.9282682538032532, "learning_rate": 9.991482967208496e-05, "loss": 0.6572, "step": 684 }, { "epoch": 0.3507424475166411, "grad_norm": 1.250457525253296, "learning_rate": 9.991308314024465e-05, "loss": 0.6718, "step": 685 }, { "epoch": 0.35125448028673834, "grad_norm": 0.892915666103363, "learning_rate": 9.991131889777968e-05, "loss": 0.6502, "step": 686 }, { "epoch": 0.35176651305683565, "grad_norm": 0.7889409065246582, "learning_rate": 9.990953694531609e-05, "loss": 0.6804, "step": 687 }, { "epoch": 0.3522785458269329, "grad_norm": 1.1725577116012573, "learning_rate": 9.990773728348613e-05, "loss": 0.6718, "step": 688 }, { "epoch": 0.3527905785970302, "grad_norm": 0.9109938144683838, "learning_rate": 9.99059199129284e-05, "loss": 0.6566, "step": 689 }, { "epoch": 0.3533026113671275, "grad_norm": 0.8002787232398987, "learning_rate": 9.990408483428774e-05, "loss": 0.6496, "step": 690 }, { "epoch": 0.3538146441372248, "grad_norm": 0.7512291669845581, "learning_rate": 9.990223204821531e-05, "loss": 0.652, "step": 691 }, { "epoch": 0.35432667690732206, "grad_norm": 1.0249329805374146, "learning_rate": 9.990036155536853e-05, "loss": 0.6557, "step": 692 }, { "epoch": 0.3548387096774194, "grad_norm": 0.7564594745635986, "learning_rate": 9.989847335641109e-05, "loss": 0.6509, "step": 693 }, { "epoch": 0.35535074244751663, "grad_norm": 0.5170691013336182, "learning_rate": 9.989656745201298e-05, "loss": 0.6517, "step": 694 }, { "epoch": 0.35586277521761395, "grad_norm": 1.0099879503250122, "learning_rate": 9.989464384285048e-05, "loss": 0.653, "step": 695 }, { "epoch": 0.3563748079877112, "grad_norm": 1.116167426109314, "learning_rate": 9.989270252960614e-05, "loss": 0.6794, "step": 696 }, { "epoch": 0.3568868407578085, "grad_norm": 0.6873573660850525, "learning_rate": 9.989074351296881e-05, "loss": 0.6443, "step": 697 }, { "epoch": 0.3573988735279058, "grad_norm": 0.6107996702194214, "learning_rate": 9.988876679363357e-05, "loss": 0.6936, "step": 698 }, { "epoch": 0.3579109062980031, "grad_norm": 1.1279820203781128, "learning_rate": 9.988677237230184e-05, "loss": 0.6402, "step": 699 }, { "epoch": 0.35842293906810035, "grad_norm": 0.6745296120643616, "learning_rate": 9.98847602496813e-05, "loss": 0.6483, "step": 700 }, { "epoch": 0.35893497183819767, "grad_norm": 1.1177527904510498, "learning_rate": 9.988273042648593e-05, "loss": 0.6734, "step": 701 }, { "epoch": 0.35944700460829493, "grad_norm": 1.074716567993164, "learning_rate": 9.988068290343592e-05, "loss": 0.6361, "step": 702 }, { "epoch": 0.35995903737839224, "grad_norm": 0.5518096089363098, "learning_rate": 9.987861768125783e-05, "loss": 0.6484, "step": 703 }, { "epoch": 0.3604710701484895, "grad_norm": 0.9884310364723206, "learning_rate": 9.987653476068445e-05, "loss": 0.6581, "step": 704 }, { "epoch": 0.36098310291858676, "grad_norm": 0.952630341053009, "learning_rate": 9.987443414245487e-05, "loss": 0.6396, "step": 705 }, { "epoch": 0.3614951356886841, "grad_norm": 0.6329352259635925, "learning_rate": 9.987231582731444e-05, "loss": 0.6729, "step": 706 }, { "epoch": 0.36200716845878134, "grad_norm": 1.20749032497406, "learning_rate": 9.98701798160148e-05, "loss": 0.6778, "step": 707 }, { "epoch": 0.36251920122887865, "grad_norm": 1.0794720649719238, "learning_rate": 9.986802610931388e-05, "loss": 0.6545, "step": 708 }, { "epoch": 0.3630312339989759, "grad_norm": 0.9728733897209167, "learning_rate": 9.986585470797586e-05, "loss": 0.6583, "step": 709 }, { "epoch": 0.3635432667690732, "grad_norm": 0.716338038444519, "learning_rate": 9.986366561277123e-05, "loss": 0.6418, "step": 710 }, { "epoch": 0.3640552995391705, "grad_norm": 0.5710652470588684, "learning_rate": 9.986145882447676e-05, "loss": 0.6407, "step": 711 }, { "epoch": 0.3645673323092678, "grad_norm": 0.9990123510360718, "learning_rate": 9.985923434387545e-05, "loss": 0.647, "step": 712 }, { "epoch": 0.36507936507936506, "grad_norm": 1.148140788078308, "learning_rate": 9.985699217175663e-05, "loss": 0.6641, "step": 713 }, { "epoch": 0.3655913978494624, "grad_norm": 0.7677567005157471, "learning_rate": 9.985473230891588e-05, "loss": 0.6652, "step": 714 }, { "epoch": 0.36610343061955963, "grad_norm": 0.8755362629890442, "learning_rate": 9.985245475615509e-05, "loss": 0.6475, "step": 715 }, { "epoch": 0.36661546338965695, "grad_norm": 0.7342329621315002, "learning_rate": 9.985015951428237e-05, "loss": 0.644, "step": 716 }, { "epoch": 0.3671274961597542, "grad_norm": 0.9522359371185303, "learning_rate": 9.984784658411216e-05, "loss": 0.658, "step": 717 }, { "epoch": 0.3676395289298515, "grad_norm": 1.3427050113677979, "learning_rate": 9.984551596646514e-05, "loss": 0.6315, "step": 718 }, { "epoch": 0.3681515616999488, "grad_norm": 0.8320227861404419, "learning_rate": 9.98431676621683e-05, "loss": 0.6454, "step": 719 }, { "epoch": 0.3686635944700461, "grad_norm": 1.0953171253204346, "learning_rate": 9.984080167205488e-05, "loss": 0.6517, "step": 720 }, { "epoch": 0.36917562724014336, "grad_norm": 1.4138046503067017, "learning_rate": 9.983841799696438e-05, "loss": 0.6263, "step": 721 }, { "epoch": 0.36968766001024067, "grad_norm": 0.7983854413032532, "learning_rate": 9.983601663774264e-05, "loss": 0.6847, "step": 722 }, { "epoch": 0.37019969278033793, "grad_norm": 0.7608910799026489, "learning_rate": 9.983359759524169e-05, "loss": 0.6611, "step": 723 }, { "epoch": 0.37071172555043524, "grad_norm": 1.1019700765609741, "learning_rate": 9.983116087031991e-05, "loss": 0.6521, "step": 724 }, { "epoch": 0.3712237583205325, "grad_norm": 0.6490877866744995, "learning_rate": 9.98287064638419e-05, "loss": 0.6582, "step": 725 }, { "epoch": 0.3717357910906298, "grad_norm": 0.7151639461517334, "learning_rate": 9.982623437667858e-05, "loss": 0.6514, "step": 726 }, { "epoch": 0.3722478238607271, "grad_norm": 0.9253897070884705, "learning_rate": 9.982374460970709e-05, "loss": 0.6764, "step": 727 }, { "epoch": 0.3727598566308244, "grad_norm": 0.9288658499717712, "learning_rate": 9.98212371638109e-05, "loss": 0.6355, "step": 728 }, { "epoch": 0.37327188940092165, "grad_norm": 0.8777347803115845, "learning_rate": 9.98187120398797e-05, "loss": 0.6371, "step": 729 }, { "epoch": 0.37378392217101897, "grad_norm": 0.5686779618263245, "learning_rate": 9.981616923880947e-05, "loss": 0.6255, "step": 730 }, { "epoch": 0.3742959549411162, "grad_norm": 0.8228483200073242, "learning_rate": 9.981360876150251e-05, "loss": 0.6481, "step": 731 }, { "epoch": 0.37480798771121354, "grad_norm": 0.8743718862533569, "learning_rate": 9.981103060886731e-05, "loss": 0.6511, "step": 732 }, { "epoch": 0.3753200204813108, "grad_norm": 0.7460940480232239, "learning_rate": 9.980843478181873e-05, "loss": 0.6637, "step": 733 }, { "epoch": 0.3758320532514081, "grad_norm": 0.5396794080734253, "learning_rate": 9.980582128127779e-05, "loss": 0.6467, "step": 734 }, { "epoch": 0.3763440860215054, "grad_norm": 0.46531859040260315, "learning_rate": 9.980319010817186e-05, "loss": 0.6306, "step": 735 }, { "epoch": 0.3768561187916027, "grad_norm": 0.563606858253479, "learning_rate": 9.980054126343456e-05, "loss": 0.6365, "step": 736 }, { "epoch": 0.37736815156169995, "grad_norm": 0.6869078874588013, "learning_rate": 9.979787474800577e-05, "loss": 0.6433, "step": 737 }, { "epoch": 0.3778801843317972, "grad_norm": 0.6015436053276062, "learning_rate": 9.979519056283164e-05, "loss": 0.6053, "step": 738 }, { "epoch": 0.3783922171018945, "grad_norm": 0.6649131178855896, "learning_rate": 9.979248870886462e-05, "loss": 0.6477, "step": 739 }, { "epoch": 0.3789042498719918, "grad_norm": 0.549165666103363, "learning_rate": 9.978976918706338e-05, "loss": 0.6496, "step": 740 }, { "epoch": 0.3794162826420891, "grad_norm": 0.5689040422439575, "learning_rate": 9.978703199839292e-05, "loss": 0.6672, "step": 741 }, { "epoch": 0.37992831541218636, "grad_norm": 0.7422501444816589, "learning_rate": 9.978427714382446e-05, "loss": 0.6608, "step": 742 }, { "epoch": 0.38044034818228367, "grad_norm": 0.7606332898139954, "learning_rate": 9.978150462433548e-05, "loss": 0.6827, "step": 743 }, { "epoch": 0.38095238095238093, "grad_norm": 0.9328123331069946, "learning_rate": 9.977871444090977e-05, "loss": 0.6698, "step": 744 }, { "epoch": 0.38146441372247825, "grad_norm": 1.0486763715744019, "learning_rate": 9.977590659453737e-05, "loss": 0.6542, "step": 745 }, { "epoch": 0.3819764464925755, "grad_norm": 1.2935494184494019, "learning_rate": 9.977308108621459e-05, "loss": 0.6455, "step": 746 }, { "epoch": 0.3824884792626728, "grad_norm": 0.6969727873802185, "learning_rate": 9.9770237916944e-05, "loss": 0.6578, "step": 747 }, { "epoch": 0.3830005120327701, "grad_norm": 1.0604690313339233, "learning_rate": 9.976737708773445e-05, "loss": 0.6548, "step": 748 }, { "epoch": 0.3835125448028674, "grad_norm": 1.2397388219833374, "learning_rate": 9.976449859960102e-05, "loss": 0.6292, "step": 749 }, { "epoch": 0.38402457757296465, "grad_norm": 0.762373685836792, "learning_rate": 9.97616024535651e-05, "loss": 0.6557, "step": 750 }, { "epoch": 0.38453661034306197, "grad_norm": 1.0155538320541382, "learning_rate": 9.975868865065431e-05, "loss": 0.651, "step": 751 }, { "epoch": 0.38504864311315923, "grad_norm": 0.9844079613685608, "learning_rate": 9.975575719190257e-05, "loss": 0.659, "step": 752 }, { "epoch": 0.38556067588325654, "grad_norm": 0.6792572736740112, "learning_rate": 9.975280807835006e-05, "loss": 0.6476, "step": 753 }, { "epoch": 0.3860727086533538, "grad_norm": 0.8088528513908386, "learning_rate": 9.974984131104318e-05, "loss": 0.6484, "step": 754 }, { "epoch": 0.3865847414234511, "grad_norm": 1.0036265850067139, "learning_rate": 9.974685689103464e-05, "loss": 0.6447, "step": 755 }, { "epoch": 0.3870967741935484, "grad_norm": 0.8075035214424133, "learning_rate": 9.97438548193834e-05, "loss": 0.6337, "step": 756 }, { "epoch": 0.3876088069636457, "grad_norm": 0.5410921573638916, "learning_rate": 9.974083509715471e-05, "loss": 0.6787, "step": 757 }, { "epoch": 0.38812083973374295, "grad_norm": 1.2285412549972534, "learning_rate": 9.973779772542e-05, "loss": 0.6765, "step": 758 }, { "epoch": 0.38863287250384027, "grad_norm": 0.8822839260101318, "learning_rate": 9.973474270525707e-05, "loss": 0.6475, "step": 759 }, { "epoch": 0.3891449052739375, "grad_norm": 0.581229567527771, "learning_rate": 9.973167003774989e-05, "loss": 0.6473, "step": 760 }, { "epoch": 0.38965693804403484, "grad_norm": 0.5646109580993652, "learning_rate": 9.972857972398876e-05, "loss": 0.6267, "step": 761 }, { "epoch": 0.3901689708141321, "grad_norm": 0.7561930418014526, "learning_rate": 9.972547176507022e-05, "loss": 0.6783, "step": 762 }, { "epoch": 0.3906810035842294, "grad_norm": 0.6261668801307678, "learning_rate": 9.972234616209704e-05, "loss": 0.6233, "step": 763 }, { "epoch": 0.3911930363543267, "grad_norm": 0.69068843126297, "learning_rate": 9.971920291617827e-05, "loss": 0.7182, "step": 764 }, { "epoch": 0.391705069124424, "grad_norm": 0.6838326454162598, "learning_rate": 9.971604202842926e-05, "loss": 0.6329, "step": 765 }, { "epoch": 0.39221710189452125, "grad_norm": 0.6681329607963562, "learning_rate": 9.971286349997156e-05, "loss": 0.6564, "step": 766 }, { "epoch": 0.39272913466461856, "grad_norm": 0.5092385411262512, "learning_rate": 9.970966733193302e-05, "loss": 0.6333, "step": 767 }, { "epoch": 0.3932411674347158, "grad_norm": 0.4963456094264984, "learning_rate": 9.970645352544772e-05, "loss": 0.6361, "step": 768 }, { "epoch": 0.3937532002048131, "grad_norm": 0.6212189793586731, "learning_rate": 9.970322208165601e-05, "loss": 0.6338, "step": 769 }, { "epoch": 0.3942652329749104, "grad_norm": 0.6702895760536194, "learning_rate": 9.969997300170453e-05, "loss": 0.6483, "step": 770 }, { "epoch": 0.39477726574500765, "grad_norm": 0.5002304911613464, "learning_rate": 9.969670628674612e-05, "loss": 0.6231, "step": 771 }, { "epoch": 0.39528929851510497, "grad_norm": 0.49956297874450684, "learning_rate": 9.969342193793992e-05, "loss": 0.6293, "step": 772 }, { "epoch": 0.39580133128520223, "grad_norm": 0.578999400138855, "learning_rate": 9.96901199564513e-05, "loss": 0.6881, "step": 773 }, { "epoch": 0.39631336405529954, "grad_norm": 0.792938232421875, "learning_rate": 9.96868003434519e-05, "loss": 0.6666, "step": 774 }, { "epoch": 0.3968253968253968, "grad_norm": 0.7070488333702087, "learning_rate": 9.968346310011964e-05, "loss": 0.6482, "step": 775 }, { "epoch": 0.3973374295954941, "grad_norm": 0.8681166768074036, "learning_rate": 9.968010822763866e-05, "loss": 0.6446, "step": 776 }, { "epoch": 0.3978494623655914, "grad_norm": 0.7346497774124146, "learning_rate": 9.967673572719935e-05, "loss": 0.6301, "step": 777 }, { "epoch": 0.3983614951356887, "grad_norm": 0.8471706509590149, "learning_rate": 9.967334559999839e-05, "loss": 0.6538, "step": 778 }, { "epoch": 0.39887352790578595, "grad_norm": 0.839423418045044, "learning_rate": 9.96699378472387e-05, "loss": 0.6163, "step": 779 }, { "epoch": 0.39938556067588327, "grad_norm": 0.5263079404830933, "learning_rate": 9.966651247012942e-05, "loss": 0.6654, "step": 780 }, { "epoch": 0.3998975934459805, "grad_norm": 0.5272893309593201, "learning_rate": 9.966306946988602e-05, "loss": 0.6434, "step": 781 }, { "epoch": 0.40040962621607784, "grad_norm": 0.7983332276344299, "learning_rate": 9.965960884773013e-05, "loss": 0.6728, "step": 782 }, { "epoch": 0.4009216589861751, "grad_norm": 0.8893037438392639, "learning_rate": 9.965613060488972e-05, "loss": 0.6324, "step": 783 }, { "epoch": 0.4014336917562724, "grad_norm": 1.0042239427566528, "learning_rate": 9.965263474259896e-05, "loss": 0.6791, "step": 784 }, { "epoch": 0.4019457245263697, "grad_norm": 0.9618741869926453, "learning_rate": 9.964912126209827e-05, "loss": 0.6514, "step": 785 }, { "epoch": 0.402457757296467, "grad_norm": 0.8845220804214478, "learning_rate": 9.964559016463435e-05, "loss": 0.6561, "step": 786 }, { "epoch": 0.40296979006656425, "grad_norm": 0.8058607578277588, "learning_rate": 9.964204145146015e-05, "loss": 0.6418, "step": 787 }, { "epoch": 0.40348182283666156, "grad_norm": 0.7716985940933228, "learning_rate": 9.963847512383482e-05, "loss": 0.6304, "step": 788 }, { "epoch": 0.4039938556067588, "grad_norm": 0.7384299635887146, "learning_rate": 9.963489118302381e-05, "loss": 0.6453, "step": 789 }, { "epoch": 0.40450588837685614, "grad_norm": 0.6477707624435425, "learning_rate": 9.963128963029884e-05, "loss": 0.6363, "step": 790 }, { "epoch": 0.4050179211469534, "grad_norm": 0.7664931416511536, "learning_rate": 9.96276704669378e-05, "loss": 0.6474, "step": 791 }, { "epoch": 0.4055299539170507, "grad_norm": 0.9302617907524109, "learning_rate": 9.962403369422491e-05, "loss": 0.6643, "step": 792 }, { "epoch": 0.40604198668714797, "grad_norm": 0.8148412108421326, "learning_rate": 9.962037931345059e-05, "loss": 0.6684, "step": 793 }, { "epoch": 0.4065540194572453, "grad_norm": 0.7309584021568298, "learning_rate": 9.961670732591152e-05, "loss": 0.6451, "step": 794 }, { "epoch": 0.40706605222734255, "grad_norm": 1.6716175079345703, "learning_rate": 9.961301773291064e-05, "loss": 0.6428, "step": 795 }, { "epoch": 0.40757808499743986, "grad_norm": 1.1584049463272095, "learning_rate": 9.96093105357571e-05, "loss": 0.6381, "step": 796 }, { "epoch": 0.4080901177675371, "grad_norm": 0.859271764755249, "learning_rate": 9.960558573576635e-05, "loss": 0.6524, "step": 797 }, { "epoch": 0.40860215053763443, "grad_norm": 1.0106620788574219, "learning_rate": 9.960184333426003e-05, "loss": 0.6715, "step": 798 }, { "epoch": 0.4091141833077317, "grad_norm": 0.9150800108909607, "learning_rate": 9.959808333256611e-05, "loss": 0.6668, "step": 799 }, { "epoch": 0.409626216077829, "grad_norm": 0.8434951901435852, "learning_rate": 9.959430573201866e-05, "loss": 0.6518, "step": 800 }, { "epoch": 0.41013824884792627, "grad_norm": 0.7799317240715027, "learning_rate": 9.959051053395817e-05, "loss": 0.6583, "step": 801 }, { "epoch": 0.4106502816180235, "grad_norm": 0.6620454788208008, "learning_rate": 9.958669773973123e-05, "loss": 0.6449, "step": 802 }, { "epoch": 0.41116231438812084, "grad_norm": 0.8226097822189331, "learning_rate": 9.958286735069078e-05, "loss": 0.6462, "step": 803 }, { "epoch": 0.4116743471582181, "grad_norm": 1.1388323307037354, "learning_rate": 9.957901936819589e-05, "loss": 0.6682, "step": 804 }, { "epoch": 0.4121863799283154, "grad_norm": 0.6155979037284851, "learning_rate": 9.9575153793612e-05, "loss": 0.6982, "step": 805 }, { "epoch": 0.4126984126984127, "grad_norm": 0.8597806692123413, "learning_rate": 9.957127062831068e-05, "loss": 0.6292, "step": 806 }, { "epoch": 0.41321044546851, "grad_norm": 1.0912470817565918, "learning_rate": 9.956736987366984e-05, "loss": 0.6614, "step": 807 }, { "epoch": 0.41372247823860725, "grad_norm": 0.9340446591377258, "learning_rate": 9.956345153107353e-05, "loss": 0.6622, "step": 808 }, { "epoch": 0.41423451100870456, "grad_norm": 0.7889856100082397, "learning_rate": 9.955951560191212e-05, "loss": 0.6651, "step": 809 }, { "epoch": 0.4147465437788018, "grad_norm": 0.7375758290290833, "learning_rate": 9.95555620875822e-05, "loss": 0.6644, "step": 810 }, { "epoch": 0.41525857654889914, "grad_norm": 0.8767827749252319, "learning_rate": 9.955159098948659e-05, "loss": 0.6609, "step": 811 }, { "epoch": 0.4157706093189964, "grad_norm": 0.8841429948806763, "learning_rate": 9.954760230903433e-05, "loss": 0.6649, "step": 812 }, { "epoch": 0.4162826420890937, "grad_norm": 0.8340454697608948, "learning_rate": 9.954359604764075e-05, "loss": 0.6587, "step": 813 }, { "epoch": 0.416794674859191, "grad_norm": 0.6618418097496033, "learning_rate": 9.953957220672736e-05, "loss": 0.6369, "step": 814 }, { "epoch": 0.4173067076292883, "grad_norm": 0.8620227575302124, "learning_rate": 9.953553078772195e-05, "loss": 0.6321, "step": 815 }, { "epoch": 0.41781874039938555, "grad_norm": 0.926149308681488, "learning_rate": 9.953147179205855e-05, "loss": 0.6588, "step": 816 }, { "epoch": 0.41833077316948286, "grad_norm": 0.5511178970336914, "learning_rate": 9.952739522117738e-05, "loss": 0.6575, "step": 817 }, { "epoch": 0.4188428059395801, "grad_norm": 0.7440958619117737, "learning_rate": 9.952330107652496e-05, "loss": 0.6466, "step": 818 }, { "epoch": 0.41935483870967744, "grad_norm": 0.777613639831543, "learning_rate": 9.9519189359554e-05, "loss": 0.6479, "step": 819 }, { "epoch": 0.4198668714797747, "grad_norm": 0.5644140839576721, "learning_rate": 9.951506007172343e-05, "loss": 0.6375, "step": 820 }, { "epoch": 0.420378904249872, "grad_norm": 0.37277287244796753, "learning_rate": 9.951091321449847e-05, "loss": 0.6518, "step": 821 }, { "epoch": 0.42089093701996927, "grad_norm": 0.6745261549949646, "learning_rate": 9.950674878935056e-05, "loss": 0.6573, "step": 822 }, { "epoch": 0.4214029697900666, "grad_norm": 0.8382142186164856, "learning_rate": 9.950256679775734e-05, "loss": 0.6637, "step": 823 }, { "epoch": 0.42191500256016384, "grad_norm": 0.8412274122238159, "learning_rate": 9.94983672412027e-05, "loss": 0.6509, "step": 824 }, { "epoch": 0.42242703533026116, "grad_norm": 0.6410328149795532, "learning_rate": 9.949415012117675e-05, "loss": 0.6449, "step": 825 }, { "epoch": 0.4229390681003584, "grad_norm": 0.9945329427719116, "learning_rate": 9.94899154391759e-05, "loss": 0.6569, "step": 826 }, { "epoch": 0.42345110087045573, "grad_norm": 1.0910792350769043, "learning_rate": 9.948566319670269e-05, "loss": 0.6342, "step": 827 }, { "epoch": 0.423963133640553, "grad_norm": 0.9393929839134216, "learning_rate": 9.948139339526597e-05, "loss": 0.6488, "step": 828 }, { "epoch": 0.4244751664106503, "grad_norm": 0.7850554585456848, "learning_rate": 9.947710603638078e-05, "loss": 0.6404, "step": 829 }, { "epoch": 0.42498719918074757, "grad_norm": 1.2192896604537964, "learning_rate": 9.947280112156839e-05, "loss": 0.673, "step": 830 }, { "epoch": 0.4254992319508449, "grad_norm": 0.5772989988327026, "learning_rate": 9.946847865235631e-05, "loss": 0.6369, "step": 831 }, { "epoch": 0.42601126472094214, "grad_norm": 0.7714506387710571, "learning_rate": 9.94641386302783e-05, "loss": 0.6576, "step": 832 }, { "epoch": 0.4265232974910394, "grad_norm": 0.9312205910682678, "learning_rate": 9.945978105687433e-05, "loss": 0.6544, "step": 833 }, { "epoch": 0.4270353302611367, "grad_norm": 0.6953701376914978, "learning_rate": 9.945540593369055e-05, "loss": 0.6618, "step": 834 }, { "epoch": 0.427547363031234, "grad_norm": 0.8925039172172546, "learning_rate": 9.945101326227944e-05, "loss": 0.653, "step": 835 }, { "epoch": 0.4280593958013313, "grad_norm": 1.030678153038025, "learning_rate": 9.944660304419961e-05, "loss": 0.6684, "step": 836 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7707819938659668, "learning_rate": 9.944217528101594e-05, "loss": 0.6903, "step": 837 }, { "epoch": 0.42908346134152586, "grad_norm": 0.8419348001480103, "learning_rate": 9.943772997429954e-05, "loss": 0.6444, "step": 838 }, { "epoch": 0.4295954941116231, "grad_norm": 0.7174859046936035, "learning_rate": 9.943326712562772e-05, "loss": 0.6353, "step": 839 }, { "epoch": 0.43010752688172044, "grad_norm": 0.7022908926010132, "learning_rate": 9.942878673658407e-05, "loss": 0.6366, "step": 840 }, { "epoch": 0.4306195596518177, "grad_norm": 0.5674614310264587, "learning_rate": 9.942428880875831e-05, "loss": 0.6351, "step": 841 }, { "epoch": 0.431131592421915, "grad_norm": 0.8667612075805664, "learning_rate": 9.941977334374645e-05, "loss": 0.6405, "step": 842 }, { "epoch": 0.43164362519201227, "grad_norm": 0.8458070158958435, "learning_rate": 9.941524034315073e-05, "loss": 0.667, "step": 843 }, { "epoch": 0.4321556579621096, "grad_norm": 0.6915360689163208, "learning_rate": 9.94106898085796e-05, "loss": 0.6549, "step": 844 }, { "epoch": 0.43266769073220684, "grad_norm": 0.6892876625061035, "learning_rate": 9.940612174164768e-05, "loss": 0.6518, "step": 845 }, { "epoch": 0.43317972350230416, "grad_norm": 0.5199620127677917, "learning_rate": 9.940153614397587e-05, "loss": 0.6511, "step": 846 }, { "epoch": 0.4336917562724014, "grad_norm": 0.8069517016410828, "learning_rate": 9.939693301719131e-05, "loss": 0.6245, "step": 847 }, { "epoch": 0.43420378904249873, "grad_norm": 0.8564416170120239, "learning_rate": 9.939231236292727e-05, "loss": 0.6568, "step": 848 }, { "epoch": 0.434715821812596, "grad_norm": 1.2015972137451172, "learning_rate": 9.938767418282333e-05, "loss": 0.6816, "step": 849 }, { "epoch": 0.4352278545826933, "grad_norm": 0.7293360829353333, "learning_rate": 9.938301847852525e-05, "loss": 0.633, "step": 850 }, { "epoch": 0.43573988735279057, "grad_norm": 0.7780644297599792, "learning_rate": 9.9378345251685e-05, "loss": 0.6311, "step": 851 }, { "epoch": 0.4362519201228879, "grad_norm": 1.0365400314331055, "learning_rate": 9.937365450396078e-05, "loss": 0.6548, "step": 852 }, { "epoch": 0.43676395289298514, "grad_norm": 0.7629266977310181, "learning_rate": 9.9368946237017e-05, "loss": 0.659, "step": 853 }, { "epoch": 0.43727598566308246, "grad_norm": 0.6457581520080566, "learning_rate": 9.93642204525243e-05, "loss": 0.6402, "step": 854 }, { "epoch": 0.4377880184331797, "grad_norm": 0.6640198230743408, "learning_rate": 9.935947715215952e-05, "loss": 0.6653, "step": 855 }, { "epoch": 0.43830005120327703, "grad_norm": 0.6551440358161926, "learning_rate": 9.935471633760573e-05, "loss": 0.6526, "step": 856 }, { "epoch": 0.4388120839733743, "grad_norm": 0.5644727945327759, "learning_rate": 9.93499380105522e-05, "loss": 0.6391, "step": 857 }, { "epoch": 0.4393241167434716, "grad_norm": 0.7183897495269775, "learning_rate": 9.934514217269441e-05, "loss": 0.676, "step": 858 }, { "epoch": 0.43983614951356886, "grad_norm": 0.9220442771911621, "learning_rate": 9.934032882573409e-05, "loss": 0.6493, "step": 859 }, { "epoch": 0.4403481822836662, "grad_norm": 0.8930736184120178, "learning_rate": 9.933549797137914e-05, "loss": 0.6567, "step": 860 }, { "epoch": 0.44086021505376344, "grad_norm": 0.7974289655685425, "learning_rate": 9.933064961134368e-05, "loss": 0.6299, "step": 861 }, { "epoch": 0.44137224782386075, "grad_norm": 0.49691009521484375, "learning_rate": 9.932578374734807e-05, "loss": 0.649, "step": 862 }, { "epoch": 0.441884280593958, "grad_norm": 0.651841938495636, "learning_rate": 9.932090038111886e-05, "loss": 0.6338, "step": 863 }, { "epoch": 0.4423963133640553, "grad_norm": 0.8254738450050354, "learning_rate": 9.93159995143888e-05, "loss": 0.6376, "step": 864 }, { "epoch": 0.4429083461341526, "grad_norm": 0.9831904768943787, "learning_rate": 9.931108114889685e-05, "loss": 0.6512, "step": 865 }, { "epoch": 0.44342037890424985, "grad_norm": 0.8030515313148499, "learning_rate": 9.930614528638821e-05, "loss": 0.6608, "step": 866 }, { "epoch": 0.44393241167434716, "grad_norm": 0.6627100706100464, "learning_rate": 9.930119192861428e-05, "loss": 0.6552, "step": 867 }, { "epoch": 0.4444444444444444, "grad_norm": 0.6755707859992981, "learning_rate": 9.929622107733263e-05, "loss": 0.6552, "step": 868 }, { "epoch": 0.44495647721454173, "grad_norm": 0.8348547220230103, "learning_rate": 9.929123273430707e-05, "loss": 0.6459, "step": 869 }, { "epoch": 0.445468509984639, "grad_norm": 1.1250085830688477, "learning_rate": 9.928622690130762e-05, "loss": 0.6759, "step": 870 }, { "epoch": 0.4459805427547363, "grad_norm": 0.7750677466392517, "learning_rate": 9.928120358011049e-05, "loss": 0.6431, "step": 871 }, { "epoch": 0.44649257552483357, "grad_norm": 0.5312778353691101, "learning_rate": 9.92761627724981e-05, "loss": 0.6441, "step": 872 }, { "epoch": 0.4470046082949309, "grad_norm": 0.9415028095245361, "learning_rate": 9.927110448025909e-05, "loss": 0.6428, "step": 873 }, { "epoch": 0.44751664106502814, "grad_norm": 1.0302003622055054, "learning_rate": 9.926602870518826e-05, "loss": 0.6414, "step": 874 }, { "epoch": 0.44802867383512546, "grad_norm": 0.811150848865509, "learning_rate": 9.926093544908668e-05, "loss": 0.6683, "step": 875 }, { "epoch": 0.4485407066052227, "grad_norm": 0.6946927905082703, "learning_rate": 9.925582471376155e-05, "loss": 0.651, "step": 876 }, { "epoch": 0.44905273937532003, "grad_norm": 0.7361304759979248, "learning_rate": 9.925069650102634e-05, "loss": 0.6562, "step": 877 }, { "epoch": 0.4495647721454173, "grad_norm": 0.7793813347816467, "learning_rate": 9.924555081270068e-05, "loss": 0.6397, "step": 878 }, { "epoch": 0.4500768049155146, "grad_norm": 0.6320583820343018, "learning_rate": 9.924038765061042e-05, "loss": 0.6306, "step": 879 }, { "epoch": 0.45058883768561186, "grad_norm": 0.7030929923057556, "learning_rate": 9.923520701658756e-05, "loss": 0.6216, "step": 880 }, { "epoch": 0.4511008704557092, "grad_norm": 0.5528194308280945, "learning_rate": 9.923000891247038e-05, "loss": 0.6328, "step": 881 }, { "epoch": 0.45161290322580644, "grad_norm": 0.6554601788520813, "learning_rate": 9.922479334010333e-05, "loss": 0.657, "step": 882 }, { "epoch": 0.45212493599590375, "grad_norm": 0.6622177362442017, "learning_rate": 9.921956030133701e-05, "loss": 0.6544, "step": 883 }, { "epoch": 0.452636968766001, "grad_norm": 0.6845796704292297, "learning_rate": 9.921430979802828e-05, "loss": 0.6297, "step": 884 }, { "epoch": 0.45314900153609833, "grad_norm": 0.7652627229690552, "learning_rate": 9.920904183204016e-05, "loss": 0.6416, "step": 885 }, { "epoch": 0.4536610343061956, "grad_norm": 0.7823075652122498, "learning_rate": 9.92037564052419e-05, "loss": 0.6383, "step": 886 }, { "epoch": 0.4541730670762929, "grad_norm": 0.7564027309417725, "learning_rate": 9.919845351950889e-05, "loss": 0.6267, "step": 887 }, { "epoch": 0.45468509984639016, "grad_norm": 0.6074973940849304, "learning_rate": 9.919313317672278e-05, "loss": 0.6266, "step": 888 }, { "epoch": 0.4551971326164875, "grad_norm": 0.5559996366500854, "learning_rate": 9.918779537877139e-05, "loss": 0.6375, "step": 889 }, { "epoch": 0.45570916538658474, "grad_norm": 0.4719943106174469, "learning_rate": 9.918244012754868e-05, "loss": 0.6318, "step": 890 }, { "epoch": 0.45622119815668205, "grad_norm": 0.5442924499511719, "learning_rate": 9.917706742495489e-05, "loss": 0.636, "step": 891 }, { "epoch": 0.4567332309267793, "grad_norm": 0.7651833891868591, "learning_rate": 9.91716772728964e-05, "loss": 0.6585, "step": 892 }, { "epoch": 0.4572452636968766, "grad_norm": 0.8152490258216858, "learning_rate": 9.916626967328581e-05, "loss": 0.6477, "step": 893 }, { "epoch": 0.4577572964669739, "grad_norm": 0.6866008043289185, "learning_rate": 9.916084462804186e-05, "loss": 0.6586, "step": 894 }, { "epoch": 0.4582693292370712, "grad_norm": 0.6732768416404724, "learning_rate": 9.915540213908955e-05, "loss": 0.6424, "step": 895 }, { "epoch": 0.45878136200716846, "grad_norm": 0.6528488993644714, "learning_rate": 9.914994220836001e-05, "loss": 0.6525, "step": 896 }, { "epoch": 0.4592933947772657, "grad_norm": 0.7473706603050232, "learning_rate": 9.914446483779061e-05, "loss": 0.652, "step": 897 }, { "epoch": 0.45980542754736303, "grad_norm": 0.7758312225341797, "learning_rate": 9.913897002932483e-05, "loss": 0.6573, "step": 898 }, { "epoch": 0.4603174603174603, "grad_norm": 1.080485224723816, "learning_rate": 9.913345778491245e-05, "loss": 0.6204, "step": 899 }, { "epoch": 0.4608294930875576, "grad_norm": 0.7469905018806458, "learning_rate": 9.912792810650936e-05, "loss": 0.6646, "step": 900 }, { "epoch": 0.46134152585765487, "grad_norm": 0.5518008470535278, "learning_rate": 9.912238099607763e-05, "loss": 0.668, "step": 901 }, { "epoch": 0.4618535586277522, "grad_norm": 0.7490918040275574, "learning_rate": 9.911681645558554e-05, "loss": 0.6545, "step": 902 }, { "epoch": 0.46236559139784944, "grad_norm": 0.7505640983581543, "learning_rate": 9.911123448700759e-05, "loss": 0.6417, "step": 903 }, { "epoch": 0.46287762416794676, "grad_norm": 0.7138315439224243, "learning_rate": 9.910563509232437e-05, "loss": 0.6286, "step": 904 }, { "epoch": 0.463389656938044, "grad_norm": 0.5319908857345581, "learning_rate": 9.910001827352275e-05, "loss": 0.6582, "step": 905 }, { "epoch": 0.46390168970814133, "grad_norm": 0.7605199813842773, "learning_rate": 9.909438403259572e-05, "loss": 0.6389, "step": 906 }, { "epoch": 0.4644137224782386, "grad_norm": 0.8155522346496582, "learning_rate": 9.908873237154248e-05, "loss": 0.6212, "step": 907 }, { "epoch": 0.4649257552483359, "grad_norm": 0.6848049759864807, "learning_rate": 9.908306329236842e-05, "loss": 0.6475, "step": 908 }, { "epoch": 0.46543778801843316, "grad_norm": 0.8671876788139343, "learning_rate": 9.907737679708507e-05, "loss": 0.6422, "step": 909 }, { "epoch": 0.4659498207885305, "grad_norm": 0.8425745964050293, "learning_rate": 9.907167288771019e-05, "loss": 0.6425, "step": 910 }, { "epoch": 0.46646185355862774, "grad_norm": 1.0046098232269287, "learning_rate": 9.906595156626768e-05, "loss": 0.6611, "step": 911 }, { "epoch": 0.46697388632872505, "grad_norm": 1.0840588808059692, "learning_rate": 9.906021283478764e-05, "loss": 0.6618, "step": 912 }, { "epoch": 0.4674859190988223, "grad_norm": 0.7240006923675537, "learning_rate": 9.905445669530632e-05, "loss": 0.6666, "step": 913 }, { "epoch": 0.4679979518689196, "grad_norm": 0.5935433506965637, "learning_rate": 9.904868314986619e-05, "loss": 0.6577, "step": 914 }, { "epoch": 0.4685099846390169, "grad_norm": 0.6838778853416443, "learning_rate": 9.904289220051585e-05, "loss": 0.6117, "step": 915 }, { "epoch": 0.4690220174091142, "grad_norm": 1.1387567520141602, "learning_rate": 9.903708384931014e-05, "loss": 0.6511, "step": 916 }, { "epoch": 0.46953405017921146, "grad_norm": 0.8208044767379761, "learning_rate": 9.903125809831e-05, "loss": 0.6574, "step": 917 }, { "epoch": 0.4700460829493088, "grad_norm": 0.7305214405059814, "learning_rate": 9.902541494958257e-05, "loss": 0.6387, "step": 918 }, { "epoch": 0.47055811571940603, "grad_norm": 0.7003597617149353, "learning_rate": 9.90195544052012e-05, "loss": 0.6404, "step": 919 }, { "epoch": 0.47107014848950335, "grad_norm": 0.7405287027359009, "learning_rate": 9.901367646724535e-05, "loss": 0.646, "step": 920 }, { "epoch": 0.4715821812596006, "grad_norm": 0.7612846493721008, "learning_rate": 9.900778113780072e-05, "loss": 0.6452, "step": 921 }, { "epoch": 0.4720942140296979, "grad_norm": 0.7146367430686951, "learning_rate": 9.900186841895912e-05, "loss": 0.6327, "step": 922 }, { "epoch": 0.4726062467997952, "grad_norm": 0.7989993095397949, "learning_rate": 9.899593831281857e-05, "loss": 0.6315, "step": 923 }, { "epoch": 0.4731182795698925, "grad_norm": 0.7013024091720581, "learning_rate": 9.898999082148323e-05, "loss": 0.6485, "step": 924 }, { "epoch": 0.47363031233998976, "grad_norm": 0.7963703870773315, "learning_rate": 9.898402594706345e-05, "loss": 0.6344, "step": 925 }, { "epoch": 0.47414234511008707, "grad_norm": 1.038550615310669, "learning_rate": 9.897804369167577e-05, "loss": 0.6569, "step": 926 }, { "epoch": 0.47465437788018433, "grad_norm": 0.9362043142318726, "learning_rate": 9.897204405744282e-05, "loss": 0.6199, "step": 927 }, { "epoch": 0.47516641065028165, "grad_norm": 0.7293409705162048, "learning_rate": 9.896602704649346e-05, "loss": 0.6523, "step": 928 }, { "epoch": 0.4756784434203789, "grad_norm": 0.6063665151596069, "learning_rate": 9.895999266096275e-05, "loss": 0.6249, "step": 929 }, { "epoch": 0.47619047619047616, "grad_norm": 0.5471436977386475, "learning_rate": 9.895394090299179e-05, "loss": 0.6205, "step": 930 }, { "epoch": 0.4767025089605735, "grad_norm": 0.9308580756187439, "learning_rate": 9.8947871774728e-05, "loss": 0.6403, "step": 931 }, { "epoch": 0.47721454173067074, "grad_norm": 1.0126181840896606, "learning_rate": 9.894178527832482e-05, "loss": 0.656, "step": 932 }, { "epoch": 0.47772657450076805, "grad_norm": 0.9440600872039795, "learning_rate": 9.893568141594194e-05, "loss": 0.637, "step": 933 }, { "epoch": 0.4782386072708653, "grad_norm": 0.6618737578392029, "learning_rate": 9.89295601897452e-05, "loss": 0.6322, "step": 934 }, { "epoch": 0.4787506400409626, "grad_norm": 0.5759153962135315, "learning_rate": 9.89234216019066e-05, "loss": 0.6251, "step": 935 }, { "epoch": 0.4792626728110599, "grad_norm": 1.1696995496749878, "learning_rate": 9.891726565460425e-05, "loss": 0.6528, "step": 936 }, { "epoch": 0.4797747055811572, "grad_norm": 0.9785498976707458, "learning_rate": 9.891109235002249e-05, "loss": 0.6487, "step": 937 }, { "epoch": 0.48028673835125446, "grad_norm": 0.8788800239562988, "learning_rate": 9.890490169035178e-05, "loss": 0.6637, "step": 938 }, { "epoch": 0.4807987711213518, "grad_norm": 0.8094038367271423, "learning_rate": 9.889869367778874e-05, "loss": 0.6455, "step": 939 }, { "epoch": 0.48131080389144903, "grad_norm": 0.5184980630874634, "learning_rate": 9.889246831453619e-05, "loss": 0.6696, "step": 940 }, { "epoch": 0.48182283666154635, "grad_norm": 0.5724202990531921, "learning_rate": 9.888622560280303e-05, "loss": 0.6566, "step": 941 }, { "epoch": 0.4823348694316436, "grad_norm": 0.7220712900161743, "learning_rate": 9.887996554480439e-05, "loss": 0.6256, "step": 942 }, { "epoch": 0.4828469022017409, "grad_norm": 0.6505951881408691, "learning_rate": 9.887368814276148e-05, "loss": 0.6513, "step": 943 }, { "epoch": 0.4833589349718382, "grad_norm": 0.9233887791633606, "learning_rate": 9.886739339890175e-05, "loss": 0.6426, "step": 944 }, { "epoch": 0.4838709677419355, "grad_norm": 0.9895170331001282, "learning_rate": 9.886108131545875e-05, "loss": 0.6541, "step": 945 }, { "epoch": 0.48438300051203276, "grad_norm": 0.5814738273620605, "learning_rate": 9.885475189467217e-05, "loss": 0.6105, "step": 946 }, { "epoch": 0.4848950332821301, "grad_norm": 0.6733271479606628, "learning_rate": 9.884840513878789e-05, "loss": 0.6483, "step": 947 }, { "epoch": 0.48540706605222733, "grad_norm": 0.7978957295417786, "learning_rate": 9.884204105005792e-05, "loss": 0.6215, "step": 948 }, { "epoch": 0.48591909882232465, "grad_norm": 0.5545995831489563, "learning_rate": 9.883565963074042e-05, "loss": 0.6478, "step": 949 }, { "epoch": 0.4864311315924219, "grad_norm": 0.6174678206443787, "learning_rate": 9.882926088309972e-05, "loss": 0.6176, "step": 950 }, { "epoch": 0.4869431643625192, "grad_norm": 0.7186322808265686, "learning_rate": 9.882284480940629e-05, "loss": 0.6276, "step": 951 }, { "epoch": 0.4874551971326165, "grad_norm": 0.6952177882194519, "learning_rate": 9.881641141193671e-05, "loss": 0.6421, "step": 952 }, { "epoch": 0.4879672299027138, "grad_norm": 0.6972012519836426, "learning_rate": 9.880996069297376e-05, "loss": 0.6173, "step": 953 }, { "epoch": 0.48847926267281105, "grad_norm": 0.6358770728111267, "learning_rate": 9.880349265480634e-05, "loss": 0.6307, "step": 954 }, { "epoch": 0.48899129544290837, "grad_norm": 0.6439440846443176, "learning_rate": 9.879700729972949e-05, "loss": 0.6292, "step": 955 }, { "epoch": 0.48950332821300563, "grad_norm": 0.5934160947799683, "learning_rate": 9.879050463004442e-05, "loss": 0.6261, "step": 956 }, { "epoch": 0.49001536098310294, "grad_norm": 0.6063875555992126, "learning_rate": 9.878398464805847e-05, "loss": 0.6626, "step": 957 }, { "epoch": 0.4905273937532002, "grad_norm": 0.6481578350067139, "learning_rate": 9.87774473560851e-05, "loss": 0.6539, "step": 958 }, { "epoch": 0.4910394265232975, "grad_norm": 0.7140811681747437, "learning_rate": 9.877089275644393e-05, "loss": 0.6298, "step": 959 }, { "epoch": 0.4915514592933948, "grad_norm": 0.8066103458404541, "learning_rate": 9.876432085146076e-05, "loss": 0.6425, "step": 960 }, { "epoch": 0.49206349206349204, "grad_norm": 0.6440595388412476, "learning_rate": 9.875773164346747e-05, "loss": 0.6305, "step": 961 }, { "epoch": 0.49257552483358935, "grad_norm": 0.5308674573898315, "learning_rate": 9.875112513480209e-05, "loss": 0.6174, "step": 962 }, { "epoch": 0.4930875576036866, "grad_norm": 0.6084829568862915, "learning_rate": 9.874450132780883e-05, "loss": 0.6406, "step": 963 }, { "epoch": 0.4935995903737839, "grad_norm": 0.9061683416366577, "learning_rate": 9.8737860224838e-05, "loss": 0.6798, "step": 964 }, { "epoch": 0.4941116231438812, "grad_norm": 0.7542415261268616, "learning_rate": 9.873120182824605e-05, "loss": 0.622, "step": 965 }, { "epoch": 0.4946236559139785, "grad_norm": 0.47065630555152893, "learning_rate": 9.872452614039557e-05, "loss": 0.6236, "step": 966 }, { "epoch": 0.49513568868407576, "grad_norm": 0.7724887728691101, "learning_rate": 9.871783316365533e-05, "loss": 0.6189, "step": 967 }, { "epoch": 0.4956477214541731, "grad_norm": 0.7198125720024109, "learning_rate": 9.871112290040017e-05, "loss": 0.6303, "step": 968 }, { "epoch": 0.49615975422427033, "grad_norm": 0.5780414938926697, "learning_rate": 9.870439535301105e-05, "loss": 0.6369, "step": 969 }, { "epoch": 0.49667178699436765, "grad_norm": 0.6796943545341492, "learning_rate": 9.869765052387516e-05, "loss": 0.6257, "step": 970 }, { "epoch": 0.4971838197644649, "grad_norm": 0.6879675388336182, "learning_rate": 9.869088841538574e-05, "loss": 0.6358, "step": 971 }, { "epoch": 0.4976958525345622, "grad_norm": 0.5937290787696838, "learning_rate": 9.868410902994217e-05, "loss": 0.6231, "step": 972 }, { "epoch": 0.4982078853046595, "grad_norm": 0.8198040723800659, "learning_rate": 9.867731236994999e-05, "loss": 0.6412, "step": 973 }, { "epoch": 0.4987199180747568, "grad_norm": 0.8082607984542847, "learning_rate": 9.867049843782086e-05, "loss": 0.6629, "step": 974 }, { "epoch": 0.49923195084485406, "grad_norm": 0.6644940972328186, "learning_rate": 9.866366723597253e-05, "loss": 0.6194, "step": 975 }, { "epoch": 0.49974398361495137, "grad_norm": 0.7274144887924194, "learning_rate": 9.865681876682897e-05, "loss": 0.6228, "step": 976 }, { "epoch": 0.5002560163850487, "grad_norm": 0.6104744076728821, "learning_rate": 9.864995303282017e-05, "loss": 0.6151, "step": 977 }, { "epoch": 0.500768049155146, "grad_norm": 0.752384603023529, "learning_rate": 9.864307003638229e-05, "loss": 0.6312, "step": 978 }, { "epoch": 0.5012800819252432, "grad_norm": 1.0827866792678833, "learning_rate": 9.863616977995764e-05, "loss": 0.6293, "step": 979 }, { "epoch": 0.5017921146953405, "grad_norm": 1.254512071609497, "learning_rate": 9.862925226599463e-05, "loss": 0.6389, "step": 980 }, { "epoch": 0.5023041474654378, "grad_norm": 0.7635893225669861, "learning_rate": 9.862231749694779e-05, "loss": 0.6461, "step": 981 }, { "epoch": 0.5028161802355351, "grad_norm": 0.6431967616081238, "learning_rate": 9.861536547527778e-05, "loss": 0.6337, "step": 982 }, { "epoch": 0.5033282130056324, "grad_norm": 1.1328961849212646, "learning_rate": 9.860839620345141e-05, "loss": 0.6545, "step": 983 }, { "epoch": 0.5038402457757296, "grad_norm": 1.0121971368789673, "learning_rate": 9.860140968394154e-05, "loss": 0.6468, "step": 984 }, { "epoch": 0.504352278545827, "grad_norm": 0.6751275658607483, "learning_rate": 9.859440591922721e-05, "loss": 0.6309, "step": 985 }, { "epoch": 0.5048643113159242, "grad_norm": 0.6840667128562927, "learning_rate": 9.858738491179356e-05, "loss": 0.6266, "step": 986 }, { "epoch": 0.5053763440860215, "grad_norm": 1.1466436386108398, "learning_rate": 9.858034666413186e-05, "loss": 0.6336, "step": 987 }, { "epoch": 0.5058883768561188, "grad_norm": 0.6939874887466431, "learning_rate": 9.857329117873945e-05, "loss": 0.6773, "step": 988 }, { "epoch": 0.5064004096262161, "grad_norm": 0.5015890598297119, "learning_rate": 9.856621845811986e-05, "loss": 0.671, "step": 989 }, { "epoch": 0.5069124423963134, "grad_norm": 0.8129735589027405, "learning_rate": 9.855912850478268e-05, "loss": 0.6335, "step": 990 }, { "epoch": 0.5074244751664106, "grad_norm": 0.9700906872749329, "learning_rate": 9.855202132124365e-05, "loss": 0.6356, "step": 991 }, { "epoch": 0.5079365079365079, "grad_norm": 0.9450486898422241, "learning_rate": 9.85448969100246e-05, "loss": 0.6506, "step": 992 }, { "epoch": 0.5084485407066052, "grad_norm": 0.7867761254310608, "learning_rate": 9.853775527365347e-05, "loss": 0.6251, "step": 993 }, { "epoch": 0.5089605734767025, "grad_norm": 0.536848783493042, "learning_rate": 9.853059641466434e-05, "loss": 0.627, "step": 994 }, { "epoch": 0.5094726062467998, "grad_norm": 0.5582588911056519, "learning_rate": 9.852342033559735e-05, "loss": 0.6281, "step": 995 }, { "epoch": 0.5099846390168971, "grad_norm": 0.5431452393531799, "learning_rate": 9.851622703899882e-05, "loss": 0.6366, "step": 996 }, { "epoch": 0.5104966717869943, "grad_norm": 0.5822605490684509, "learning_rate": 9.850901652742113e-05, "loss": 0.6408, "step": 997 }, { "epoch": 0.5110087045570917, "grad_norm": 0.6007508635520935, "learning_rate": 9.850178880342275e-05, "loss": 0.6492, "step": 998 }, { "epoch": 0.511520737327189, "grad_norm": 0.5886169672012329, "learning_rate": 9.849454386956835e-05, "loss": 0.6561, "step": 999 }, { "epoch": 0.5120327700972862, "grad_norm": 0.6438140273094177, "learning_rate": 9.84872817284286e-05, "loss": 0.6093, "step": 1000 }, { "epoch": 0.5125448028673835, "grad_norm": 0.7203197479248047, "learning_rate": 9.848000238258033e-05, "loss": 0.6284, "step": 1001 }, { "epoch": 0.5130568356374808, "grad_norm": 0.6517729759216309, "learning_rate": 9.847270583460648e-05, "loss": 0.6336, "step": 1002 }, { "epoch": 0.5135688684075781, "grad_norm": 0.49625805020332336, "learning_rate": 9.846539208709604e-05, "loss": 0.6092, "step": 1003 }, { "epoch": 0.5140809011776754, "grad_norm": 0.6172817945480347, "learning_rate": 9.845806114264418e-05, "loss": 0.6235, "step": 1004 }, { "epoch": 0.5145929339477726, "grad_norm": 0.5836978554725647, "learning_rate": 9.845071300385213e-05, "loss": 0.6266, "step": 1005 }, { "epoch": 0.51510496671787, "grad_norm": 0.9000675082206726, "learning_rate": 9.844334767332721e-05, "loss": 0.6422, "step": 1006 }, { "epoch": 0.5156169994879672, "grad_norm": 0.7782149910926819, "learning_rate": 9.843596515368287e-05, "loss": 0.6112, "step": 1007 }, { "epoch": 0.5161290322580645, "grad_norm": 0.7996637225151062, "learning_rate": 9.842856544753862e-05, "loss": 0.6368, "step": 1008 }, { "epoch": 0.5166410650281618, "grad_norm": 0.7709027528762817, "learning_rate": 9.842114855752012e-05, "loss": 0.6469, "step": 1009 }, { "epoch": 0.5171530977982591, "grad_norm": 0.9016855359077454, "learning_rate": 9.841371448625909e-05, "loss": 0.6285, "step": 1010 }, { "epoch": 0.5176651305683564, "grad_norm": 0.8317198753356934, "learning_rate": 9.840626323639334e-05, "loss": 0.6426, "step": 1011 }, { "epoch": 0.5181771633384537, "grad_norm": 0.7697166800498962, "learning_rate": 9.839879481056682e-05, "loss": 0.6446, "step": 1012 }, { "epoch": 0.5186891961085509, "grad_norm": 0.7397597432136536, "learning_rate": 9.839130921142953e-05, "loss": 0.6515, "step": 1013 }, { "epoch": 0.5192012288786483, "grad_norm": 1.0622540712356567, "learning_rate": 9.83838064416376e-05, "loss": 0.6384, "step": 1014 }, { "epoch": 0.5197132616487455, "grad_norm": 0.7562738060951233, "learning_rate": 9.83762865038532e-05, "loss": 0.6147, "step": 1015 }, { "epoch": 0.5202252944188428, "grad_norm": 0.5041040182113647, "learning_rate": 9.836874940074466e-05, "loss": 0.6318, "step": 1016 }, { "epoch": 0.5207373271889401, "grad_norm": 0.8384548425674438, "learning_rate": 9.836119513498631e-05, "loss": 0.6188, "step": 1017 }, { "epoch": 0.5212493599590374, "grad_norm": 0.913390040397644, "learning_rate": 9.835362370925868e-05, "loss": 0.6447, "step": 1018 }, { "epoch": 0.5217613927291347, "grad_norm": 0.5998526811599731, "learning_rate": 9.834603512624832e-05, "loss": 0.6423, "step": 1019 }, { "epoch": 0.522273425499232, "grad_norm": 0.5891745686531067, "learning_rate": 9.833842938864785e-05, "loss": 0.6386, "step": 1020 }, { "epoch": 0.5227854582693292, "grad_norm": 0.7420984506607056, "learning_rate": 9.833080649915606e-05, "loss": 0.6489, "step": 1021 }, { "epoch": 0.5232974910394266, "grad_norm": 0.8549829721450806, "learning_rate": 9.832316646047774e-05, "loss": 0.6383, "step": 1022 }, { "epoch": 0.5238095238095238, "grad_norm": 0.7601368427276611, "learning_rate": 9.83155092753238e-05, "loss": 0.6447, "step": 1023 }, { "epoch": 0.5243215565796211, "grad_norm": 0.552717387676239, "learning_rate": 9.830783494641126e-05, "loss": 0.6284, "step": 1024 }, { "epoch": 0.5248335893497184, "grad_norm": 0.9253878593444824, "learning_rate": 9.830014347646316e-05, "loss": 0.6402, "step": 1025 }, { "epoch": 0.5253456221198156, "grad_norm": 0.9387562274932861, "learning_rate": 9.829243486820869e-05, "loss": 0.6266, "step": 1026 }, { "epoch": 0.525857654889913, "grad_norm": 1.065827488899231, "learning_rate": 9.828470912438308e-05, "loss": 0.6345, "step": 1027 }, { "epoch": 0.5263696876600102, "grad_norm": 0.8695163726806641, "learning_rate": 9.827696624772764e-05, "loss": 0.6494, "step": 1028 }, { "epoch": 0.5268817204301075, "grad_norm": 0.9098474979400635, "learning_rate": 9.826920624098978e-05, "loss": 0.6406, "step": 1029 }, { "epoch": 0.5273937532002048, "grad_norm": 0.9644160270690918, "learning_rate": 9.826142910692298e-05, "loss": 0.6336, "step": 1030 }, { "epoch": 0.5279057859703021, "grad_norm": 0.9291470050811768, "learning_rate": 9.825363484828678e-05, "loss": 0.6173, "step": 1031 }, { "epoch": 0.5284178187403994, "grad_norm": 1.0693367719650269, "learning_rate": 9.824582346784683e-05, "loss": 0.6515, "step": 1032 }, { "epoch": 0.5289298515104967, "grad_norm": 0.9427778720855713, "learning_rate": 9.823799496837484e-05, "loss": 0.6435, "step": 1033 }, { "epoch": 0.5294418842805939, "grad_norm": 0.7754147052764893, "learning_rate": 9.823014935264856e-05, "loss": 0.6552, "step": 1034 }, { "epoch": 0.5299539170506913, "grad_norm": 0.6575049757957458, "learning_rate": 9.822228662345187e-05, "loss": 0.6156, "step": 1035 }, { "epoch": 0.5304659498207885, "grad_norm": 0.8591263294219971, "learning_rate": 9.82144067835747e-05, "loss": 0.627, "step": 1036 }, { "epoch": 0.5309779825908858, "grad_norm": 0.895183801651001, "learning_rate": 9.820650983581303e-05, "loss": 0.6405, "step": 1037 }, { "epoch": 0.5314900153609831, "grad_norm": 0.9775090217590332, "learning_rate": 9.819859578296892e-05, "loss": 0.6499, "step": 1038 }, { "epoch": 0.5320020481310804, "grad_norm": 0.8283900618553162, "learning_rate": 9.819066462785056e-05, "loss": 0.6499, "step": 1039 }, { "epoch": 0.5325140809011777, "grad_norm": 0.7861708998680115, "learning_rate": 9.81827163732721e-05, "loss": 0.6339, "step": 1040 }, { "epoch": 0.533026113671275, "grad_norm": 0.7293587327003479, "learning_rate": 9.817475102205384e-05, "loss": 0.6454, "step": 1041 }, { "epoch": 0.5335381464413722, "grad_norm": 0.6398168802261353, "learning_rate": 9.816676857702212e-05, "loss": 0.6199, "step": 1042 }, { "epoch": 0.5340501792114696, "grad_norm": 0.582253634929657, "learning_rate": 9.815876904100933e-05, "loss": 0.6096, "step": 1043 }, { "epoch": 0.5345622119815668, "grad_norm": 0.7089941501617432, "learning_rate": 9.815075241685396e-05, "loss": 0.6383, "step": 1044 }, { "epoch": 0.5350742447516641, "grad_norm": 0.5586503148078918, "learning_rate": 9.814271870740054e-05, "loss": 0.6257, "step": 1045 }, { "epoch": 0.5355862775217614, "grad_norm": 0.5903128981590271, "learning_rate": 9.813466791549966e-05, "loss": 0.6385, "step": 1046 }, { "epoch": 0.5360983102918587, "grad_norm": 0.5273396372795105, "learning_rate": 9.812660004400796e-05, "loss": 0.6363, "step": 1047 }, { "epoch": 0.536610343061956, "grad_norm": 0.8606099486351013, "learning_rate": 9.811851509578818e-05, "loss": 0.6285, "step": 1048 }, { "epoch": 0.5371223758320532, "grad_norm": 0.8825896382331848, "learning_rate": 9.811041307370909e-05, "loss": 0.6239, "step": 1049 }, { "epoch": 0.5376344086021505, "grad_norm": 0.9341815710067749, "learning_rate": 9.810229398064552e-05, "loss": 0.6351, "step": 1050 }, { "epoch": 0.5381464413722479, "grad_norm": 0.7689720392227173, "learning_rate": 9.809415781947838e-05, "loss": 0.6283, "step": 1051 }, { "epoch": 0.5386584741423451, "grad_norm": 0.7184011936187744, "learning_rate": 9.80860045930946e-05, "loss": 0.6579, "step": 1052 }, { "epoch": 0.5391705069124424, "grad_norm": 1.2499613761901855, "learning_rate": 9.807783430438719e-05, "loss": 0.6202, "step": 1053 }, { "epoch": 0.5396825396825397, "grad_norm": 0.7684334516525269, "learning_rate": 9.80696469562552e-05, "loss": 0.6301, "step": 1054 }, { "epoch": 0.540194572452637, "grad_norm": 0.9585006237030029, "learning_rate": 9.806144255160374e-05, "loss": 0.6464, "step": 1055 }, { "epoch": 0.5407066052227343, "grad_norm": 0.8185453414916992, "learning_rate": 9.805322109334401e-05, "loss": 0.6257, "step": 1056 }, { "epoch": 0.5412186379928315, "grad_norm": 0.8512575626373291, "learning_rate": 9.804498258439318e-05, "loss": 0.6096, "step": 1057 }, { "epoch": 0.5417306707629288, "grad_norm": 1.0834158658981323, "learning_rate": 9.803672702767452e-05, "loss": 0.6215, "step": 1058 }, { "epoch": 0.5422427035330261, "grad_norm": 0.762290894985199, "learning_rate": 9.802845442611737e-05, "loss": 0.6383, "step": 1059 }, { "epoch": 0.5427547363031234, "grad_norm": 0.669750452041626, "learning_rate": 9.802016478265706e-05, "loss": 0.6419, "step": 1060 }, { "epoch": 0.5432667690732207, "grad_norm": 0.9417893886566162, "learning_rate": 9.801185810023503e-05, "loss": 0.6478, "step": 1061 }, { "epoch": 0.543778801843318, "grad_norm": 0.8275420665740967, "learning_rate": 9.80035343817987e-05, "loss": 0.6271, "step": 1062 }, { "epoch": 0.5442908346134152, "grad_norm": 0.7331738471984863, "learning_rate": 9.79951936303016e-05, "loss": 0.5983, "step": 1063 }, { "epoch": 0.5448028673835126, "grad_norm": 0.5509605407714844, "learning_rate": 9.798683584870325e-05, "loss": 0.6217, "step": 1064 }, { "epoch": 0.5453149001536098, "grad_norm": 0.7846843600273132, "learning_rate": 9.797846103996927e-05, "loss": 0.636, "step": 1065 }, { "epoch": 0.5458269329237071, "grad_norm": 0.669933557510376, "learning_rate": 9.797006920707124e-05, "loss": 0.6399, "step": 1066 }, { "epoch": 0.5463389656938044, "grad_norm": 0.6141358613967896, "learning_rate": 9.796166035298684e-05, "loss": 0.6538, "step": 1067 }, { "epoch": 0.5468509984639017, "grad_norm": 0.5318694114685059, "learning_rate": 9.79532344806998e-05, "loss": 0.6189, "step": 1068 }, { "epoch": 0.547363031233999, "grad_norm": 0.6493934392929077, "learning_rate": 9.794479159319987e-05, "loss": 0.6337, "step": 1069 }, { "epoch": 0.5478750640040962, "grad_norm": 0.7129163146018982, "learning_rate": 9.793633169348283e-05, "loss": 0.6197, "step": 1070 }, { "epoch": 0.5483870967741935, "grad_norm": 0.6433539986610413, "learning_rate": 9.792785478455046e-05, "loss": 0.6376, "step": 1071 }, { "epoch": 0.5488991295442909, "grad_norm": 0.568597137928009, "learning_rate": 9.791936086941064e-05, "loss": 0.6346, "step": 1072 }, { "epoch": 0.5494111623143881, "grad_norm": 0.5000737309455872, "learning_rate": 9.791084995107728e-05, "loss": 0.6276, "step": 1073 }, { "epoch": 0.5499231950844854, "grad_norm": 0.6240944862365723, "learning_rate": 9.79023220325703e-05, "loss": 0.6263, "step": 1074 }, { "epoch": 0.5504352278545827, "grad_norm": 0.7012926340103149, "learning_rate": 9.789377711691565e-05, "loss": 0.6381, "step": 1075 }, { "epoch": 0.55094726062468, "grad_norm": 0.6090068221092224, "learning_rate": 9.788521520714529e-05, "loss": 0.6319, "step": 1076 }, { "epoch": 0.5514592933947773, "grad_norm": 0.6522509455680847, "learning_rate": 9.787663630629727e-05, "loss": 0.6429, "step": 1077 }, { "epoch": 0.5519713261648745, "grad_norm": 0.7434224486351013, "learning_rate": 9.786804041741562e-05, "loss": 0.6249, "step": 1078 }, { "epoch": 0.5524833589349718, "grad_norm": 2.728050708770752, "learning_rate": 9.785942754355042e-05, "loss": 0.6417, "step": 1079 }, { "epoch": 0.5529953917050692, "grad_norm": 1.0482674837112427, "learning_rate": 9.785079768775776e-05, "loss": 0.6316, "step": 1080 }, { "epoch": 0.5535074244751664, "grad_norm": 1.8674532175064087, "learning_rate": 9.784215085309976e-05, "loss": 0.6694, "step": 1081 }, { "epoch": 0.5540194572452637, "grad_norm": 1.1397632360458374, "learning_rate": 9.783348704264461e-05, "loss": 0.6417, "step": 1082 }, { "epoch": 0.554531490015361, "grad_norm": 2.53764009475708, "learning_rate": 9.782480625946643e-05, "loss": 0.6548, "step": 1083 }, { "epoch": 0.5550435227854583, "grad_norm": 2.0617332458496094, "learning_rate": 9.781610850664547e-05, "loss": 0.6553, "step": 1084 }, { "epoch": 0.5555555555555556, "grad_norm": 1.1458535194396973, "learning_rate": 9.78073937872679e-05, "loss": 0.6395, "step": 1085 }, { "epoch": 0.5560675883256528, "grad_norm": 1.285351037979126, "learning_rate": 9.779866210442599e-05, "loss": 0.6648, "step": 1086 }, { "epoch": 0.5565796210957501, "grad_norm": 1.7149001359939575, "learning_rate": 9.778991346121797e-05, "loss": 0.6319, "step": 1087 }, { "epoch": 0.5570916538658475, "grad_norm": 1.3041085004806519, "learning_rate": 9.778114786074813e-05, "loss": 0.6338, "step": 1088 }, { "epoch": 0.5576036866359447, "grad_norm": 1.7082310914993286, "learning_rate": 9.777236530612678e-05, "loss": 0.648, "step": 1089 }, { "epoch": 0.558115719406042, "grad_norm": 1.1352910995483398, "learning_rate": 9.776356580047019e-05, "loss": 0.6592, "step": 1090 }, { "epoch": 0.5586277521761392, "grad_norm": 1.9558664560317993, "learning_rate": 9.775474934690072e-05, "loss": 0.6463, "step": 1091 }, { "epoch": 0.5591397849462365, "grad_norm": 1.6284534931182861, "learning_rate": 9.774591594854667e-05, "loss": 0.6494, "step": 1092 }, { "epoch": 0.5596518177163339, "grad_norm": 1.357466697692871, "learning_rate": 9.773706560854242e-05, "loss": 0.6288, "step": 1093 }, { "epoch": 0.5601638504864311, "grad_norm": 1.1903069019317627, "learning_rate": 9.772819833002831e-05, "loss": 0.6273, "step": 1094 }, { "epoch": 0.5606758832565284, "grad_norm": 1.51093590259552, "learning_rate": 9.771931411615072e-05, "loss": 0.6159, "step": 1095 }, { "epoch": 0.5611879160266257, "grad_norm": 1.2882379293441772, "learning_rate": 9.771041297006202e-05, "loss": 0.6634, "step": 1096 }, { "epoch": 0.561699948796723, "grad_norm": 1.7753334045410156, "learning_rate": 9.770149489492062e-05, "loss": 0.6367, "step": 1097 }, { "epoch": 0.5622119815668203, "grad_norm": 1.4440935850143433, "learning_rate": 9.76925598938909e-05, "loss": 0.6322, "step": 1098 }, { "epoch": 0.5627240143369175, "grad_norm": 1.4848462343215942, "learning_rate": 9.768360797014324e-05, "loss": 0.6444, "step": 1099 }, { "epoch": 0.5632360471070148, "grad_norm": 1.2265055179595947, "learning_rate": 9.767463912685407e-05, "loss": 0.6138, "step": 1100 }, { "epoch": 0.5637480798771122, "grad_norm": 1.4012532234191895, "learning_rate": 9.766565336720579e-05, "loss": 0.6313, "step": 1101 }, { "epoch": 0.5642601126472094, "grad_norm": 1.241430640220642, "learning_rate": 9.765665069438682e-05, "loss": 0.6327, "step": 1102 }, { "epoch": 0.5647721454173067, "grad_norm": 1.5843876600265503, "learning_rate": 9.764763111159156e-05, "loss": 0.6131, "step": 1103 }, { "epoch": 0.565284178187404, "grad_norm": 1.4631824493408203, "learning_rate": 9.763859462202043e-05, "loss": 0.6235, "step": 1104 }, { "epoch": 0.5657962109575013, "grad_norm": 1.5984071493148804, "learning_rate": 9.762954122887984e-05, "loss": 0.6363, "step": 1105 }, { "epoch": 0.5663082437275986, "grad_norm": 1.207395076751709, "learning_rate": 9.762047093538219e-05, "loss": 0.6195, "step": 1106 }, { "epoch": 0.5668202764976958, "grad_norm": 1.5179636478424072, "learning_rate": 9.76113837447459e-05, "loss": 0.6299, "step": 1107 }, { "epoch": 0.5673323092677931, "grad_norm": 1.1965807676315308, "learning_rate": 9.760227966019536e-05, "loss": 0.6164, "step": 1108 }, { "epoch": 0.5678443420378905, "grad_norm": 1.6597193479537964, "learning_rate": 9.759315868496098e-05, "loss": 0.6422, "step": 1109 }, { "epoch": 0.5683563748079877, "grad_norm": 1.2287328243255615, "learning_rate": 9.758402082227913e-05, "loss": 0.6437, "step": 1110 }, { "epoch": 0.568868407578085, "grad_norm": 1.6836546659469604, "learning_rate": 9.757486607539221e-05, "loss": 0.6209, "step": 1111 }, { "epoch": 0.5693804403481822, "grad_norm": 1.5366079807281494, "learning_rate": 9.756569444754858e-05, "loss": 0.6423, "step": 1112 }, { "epoch": 0.5698924731182796, "grad_norm": 1.218319058418274, "learning_rate": 9.755650594200261e-05, "loss": 0.6405, "step": 1113 }, { "epoch": 0.5704045058883769, "grad_norm": 1.0355260372161865, "learning_rate": 9.754730056201464e-05, "loss": 0.628, "step": 1114 }, { "epoch": 0.5709165386584741, "grad_norm": 1.600469946861267, "learning_rate": 9.753807831085104e-05, "loss": 0.6211, "step": 1115 }, { "epoch": 0.5714285714285714, "grad_norm": 1.3293061256408691, "learning_rate": 9.752883919178409e-05, "loss": 0.6256, "step": 1116 }, { "epoch": 0.5719406041986688, "grad_norm": 1.7368829250335693, "learning_rate": 9.751958320809213e-05, "loss": 0.6425, "step": 1117 }, { "epoch": 0.572452636968766, "grad_norm": 1.4636058807373047, "learning_rate": 9.751031036305944e-05, "loss": 0.6169, "step": 1118 }, { "epoch": 0.5729646697388633, "grad_norm": 1.1457459926605225, "learning_rate": 9.750102065997631e-05, "loss": 0.6238, "step": 1119 }, { "epoch": 0.5734767025089605, "grad_norm": 1.0848761796951294, "learning_rate": 9.749171410213897e-05, "loss": 0.6129, "step": 1120 }, { "epoch": 0.5739887352790578, "grad_norm": 1.4628567695617676, "learning_rate": 9.74823906928497e-05, "loss": 0.6005, "step": 1121 }, { "epoch": 0.5745007680491552, "grad_norm": 1.2211050987243652, "learning_rate": 9.74730504354167e-05, "loss": 0.6433, "step": 1122 }, { "epoch": 0.5750128008192524, "grad_norm": 1.575066328048706, "learning_rate": 9.746369333315417e-05, "loss": 0.6181, "step": 1123 }, { "epoch": 0.5755248335893497, "grad_norm": 1.4435772895812988, "learning_rate": 9.745431938938227e-05, "loss": 0.6495, "step": 1124 }, { "epoch": 0.576036866359447, "grad_norm": 1.1776121854782104, "learning_rate": 9.744492860742718e-05, "loss": 0.6138, "step": 1125 }, { "epoch": 0.5765488991295443, "grad_norm": 1.0703320503234863, "learning_rate": 9.7435520990621e-05, "loss": 0.6029, "step": 1126 }, { "epoch": 0.5770609318996416, "grad_norm": 1.6608999967575073, "learning_rate": 9.742609654230182e-05, "loss": 0.6396, "step": 1127 }, { "epoch": 0.5775729646697388, "grad_norm": 1.4268723726272583, "learning_rate": 9.741665526581374e-05, "loss": 0.6409, "step": 1128 }, { "epoch": 0.5780849974398361, "grad_norm": 1.224826693534851, "learning_rate": 9.740719716450679e-05, "loss": 0.6209, "step": 1129 }, { "epoch": 0.5785970302099335, "grad_norm": 1.028591513633728, "learning_rate": 9.739772224173696e-05, "loss": 0.6201, "step": 1130 }, { "epoch": 0.5791090629800307, "grad_norm": 1.6508582830429077, "learning_rate": 9.738823050086626e-05, "loss": 0.6231, "step": 1131 }, { "epoch": 0.579621095750128, "grad_norm": 1.3916399478912354, "learning_rate": 9.737872194526263e-05, "loss": 0.6315, "step": 1132 }, { "epoch": 0.5801331285202252, "grad_norm": 1.1340396404266357, "learning_rate": 9.736919657829997e-05, "loss": 0.6281, "step": 1133 }, { "epoch": 0.5806451612903226, "grad_norm": 1.0798519849777222, "learning_rate": 9.735965440335816e-05, "loss": 0.6167, "step": 1134 }, { "epoch": 0.5811571940604199, "grad_norm": 1.262539267539978, "learning_rate": 9.735009542382307e-05, "loss": 0.6207, "step": 1135 }, { "epoch": 0.5816692268305171, "grad_norm": 1.1519051790237427, "learning_rate": 9.73405196430865e-05, "loss": 0.6244, "step": 1136 }, { "epoch": 0.5821812596006144, "grad_norm": 1.406802773475647, "learning_rate": 9.733092706454619e-05, "loss": 0.6389, "step": 1137 }, { "epoch": 0.5826932923707118, "grad_norm": 1.1961232423782349, "learning_rate": 9.732131769160588e-05, "loss": 0.6552, "step": 1138 }, { "epoch": 0.583205325140809, "grad_norm": 1.3692879676818848, "learning_rate": 9.731169152767527e-05, "loss": 0.6181, "step": 1139 }, { "epoch": 0.5837173579109063, "grad_norm": 1.3459694385528564, "learning_rate": 9.730204857616998e-05, "loss": 0.637, "step": 1140 }, { "epoch": 0.5842293906810035, "grad_norm": 1.2084556818008423, "learning_rate": 9.729238884051164e-05, "loss": 0.6129, "step": 1141 }, { "epoch": 0.5847414234511009, "grad_norm": 1.0156534910202026, "learning_rate": 9.72827123241278e-05, "loss": 0.6112, "step": 1142 }, { "epoch": 0.5852534562211982, "grad_norm": 1.2562575340270996, "learning_rate": 9.727301903045197e-05, "loss": 0.6461, "step": 1143 }, { "epoch": 0.5857654889912954, "grad_norm": 0.8750985264778137, "learning_rate": 9.72633089629236e-05, "loss": 0.6217, "step": 1144 }, { "epoch": 0.5862775217613927, "grad_norm": 1.2820980548858643, "learning_rate": 9.725358212498813e-05, "loss": 0.6457, "step": 1145 }, { "epoch": 0.5867895545314901, "grad_norm": 0.8551255464553833, "learning_rate": 9.724383852009691e-05, "loss": 0.6169, "step": 1146 }, { "epoch": 0.5873015873015873, "grad_norm": 1.4076045751571655, "learning_rate": 9.723407815170726e-05, "loss": 0.6373, "step": 1147 }, { "epoch": 0.5878136200716846, "grad_norm": 0.7933321595191956, "learning_rate": 9.722430102328246e-05, "loss": 0.6389, "step": 1148 }, { "epoch": 0.5883256528417818, "grad_norm": 1.4257011413574219, "learning_rate": 9.72145071382917e-05, "loss": 0.6246, "step": 1149 }, { "epoch": 0.5888376856118792, "grad_norm": 1.08174729347229, "learning_rate": 9.720469650021014e-05, "loss": 0.6328, "step": 1150 }, { "epoch": 0.5893497183819765, "grad_norm": 1.5686124563217163, "learning_rate": 9.71948691125189e-05, "loss": 0.6527, "step": 1151 }, { "epoch": 0.5898617511520737, "grad_norm": 1.3679481744766235, "learning_rate": 9.7185024978705e-05, "loss": 0.6378, "step": 1152 }, { "epoch": 0.590373783922171, "grad_norm": 1.1667219400405884, "learning_rate": 9.717516410226145e-05, "loss": 0.6326, "step": 1153 }, { "epoch": 0.5908858166922683, "grad_norm": 0.9898416996002197, "learning_rate": 9.716528648668715e-05, "loss": 0.6434, "step": 1154 }, { "epoch": 0.5913978494623656, "grad_norm": 1.1355302333831787, "learning_rate": 9.7155392135487e-05, "loss": 0.6209, "step": 1155 }, { "epoch": 0.5919098822324629, "grad_norm": 1.039116621017456, "learning_rate": 9.71454810521718e-05, "loss": 0.6166, "step": 1156 }, { "epoch": 0.5924219150025601, "grad_norm": 0.9524765610694885, "learning_rate": 9.713555324025828e-05, "loss": 0.6244, "step": 1157 }, { "epoch": 0.5929339477726574, "grad_norm": 0.9291190505027771, "learning_rate": 9.712560870326911e-05, "loss": 0.6164, "step": 1158 }, { "epoch": 0.5934459805427548, "grad_norm": 0.8266804814338684, "learning_rate": 9.711564744473293e-05, "loss": 0.6107, "step": 1159 }, { "epoch": 0.593958013312852, "grad_norm": 0.8084918260574341, "learning_rate": 9.710566946818427e-05, "loss": 0.6269, "step": 1160 }, { "epoch": 0.5944700460829493, "grad_norm": 0.6779957413673401, "learning_rate": 9.70956747771636e-05, "loss": 0.6165, "step": 1161 }, { "epoch": 0.5949820788530465, "grad_norm": 0.7648487687110901, "learning_rate": 9.708566337521736e-05, "loss": 0.6395, "step": 1162 }, { "epoch": 0.5954941116231439, "grad_norm": 1.0171860456466675, "learning_rate": 9.707563526589784e-05, "loss": 0.6389, "step": 1163 }, { "epoch": 0.5960061443932412, "grad_norm": 0.7516344785690308, "learning_rate": 9.706559045276335e-05, "loss": 0.6315, "step": 1164 }, { "epoch": 0.5965181771633384, "grad_norm": 0.7484946846961975, "learning_rate": 9.705552893937808e-05, "loss": 0.6045, "step": 1165 }, { "epoch": 0.5970302099334357, "grad_norm": 0.6407666206359863, "learning_rate": 9.704545072931214e-05, "loss": 0.6207, "step": 1166 }, { "epoch": 0.5975422427035331, "grad_norm": 0.7840438485145569, "learning_rate": 9.703535582614156e-05, "loss": 0.6276, "step": 1167 }, { "epoch": 0.5980542754736303, "grad_norm": 0.5823659896850586, "learning_rate": 9.702524423344834e-05, "loss": 0.622, "step": 1168 }, { "epoch": 0.5985663082437276, "grad_norm": 0.6570319533348083, "learning_rate": 9.701511595482034e-05, "loss": 0.6244, "step": 1169 }, { "epoch": 0.5990783410138248, "grad_norm": 0.6401359438896179, "learning_rate": 9.700497099385137e-05, "loss": 0.6435, "step": 1170 }, { "epoch": 0.5995903737839222, "grad_norm": 0.5320309996604919, "learning_rate": 9.699480935414119e-05, "loss": 0.6279, "step": 1171 }, { "epoch": 0.6001024065540195, "grad_norm": 0.5487993359565735, "learning_rate": 9.698463103929542e-05, "loss": 0.6045, "step": 1172 }, { "epoch": 0.6006144393241167, "grad_norm": 0.525513231754303, "learning_rate": 9.697443605292564e-05, "loss": 0.6522, "step": 1173 }, { "epoch": 0.601126472094214, "grad_norm": 0.6837470531463623, "learning_rate": 9.696422439864932e-05, "loss": 0.6374, "step": 1174 }, { "epoch": 0.6016385048643114, "grad_norm": 0.592113196849823, "learning_rate": 9.695399608008985e-05, "loss": 0.6199, "step": 1175 }, { "epoch": 0.6021505376344086, "grad_norm": 0.7034818530082703, "learning_rate": 9.694375110087655e-05, "loss": 0.6202, "step": 1176 }, { "epoch": 0.6026625704045059, "grad_norm": 0.43299993872642517, "learning_rate": 9.693348946464463e-05, "loss": 0.6374, "step": 1177 }, { "epoch": 0.6031746031746031, "grad_norm": 0.6703401803970337, "learning_rate": 9.692321117503522e-05, "loss": 0.645, "step": 1178 }, { "epoch": 0.6036866359447005, "grad_norm": 0.742361843585968, "learning_rate": 9.691291623569536e-05, "loss": 0.6255, "step": 1179 }, { "epoch": 0.6041986687147978, "grad_norm": 0.7389101386070251, "learning_rate": 9.690260465027801e-05, "loss": 0.6327, "step": 1180 }, { "epoch": 0.604710701484895, "grad_norm": 0.5236951112747192, "learning_rate": 9.689227642244199e-05, "loss": 0.6208, "step": 1181 }, { "epoch": 0.6052227342549923, "grad_norm": 0.6389103531837463, "learning_rate": 9.688193155585208e-05, "loss": 0.6134, "step": 1182 }, { "epoch": 0.6057347670250897, "grad_norm": 0.9146376252174377, "learning_rate": 9.687157005417895e-05, "loss": 0.6384, "step": 1183 }, { "epoch": 0.6062467997951869, "grad_norm": 0.9438178539276123, "learning_rate": 9.686119192109915e-05, "loss": 0.6261, "step": 1184 }, { "epoch": 0.6067588325652842, "grad_norm": 0.6499063968658447, "learning_rate": 9.685079716029515e-05, "loss": 0.6195, "step": 1185 }, { "epoch": 0.6072708653353814, "grad_norm": 0.583290696144104, "learning_rate": 9.684038577545531e-05, "loss": 0.612, "step": 1186 }, { "epoch": 0.6077828981054787, "grad_norm": 0.7424983382225037, "learning_rate": 9.68299577702739e-05, "loss": 0.6409, "step": 1187 }, { "epoch": 0.6082949308755761, "grad_norm": 0.6459847688674927, "learning_rate": 9.68195131484511e-05, "loss": 0.6303, "step": 1188 }, { "epoch": 0.6088069636456733, "grad_norm": 0.667349100112915, "learning_rate": 9.680905191369292e-05, "loss": 0.6063, "step": 1189 }, { "epoch": 0.6093189964157706, "grad_norm": 0.5919795036315918, "learning_rate": 9.679857406971135e-05, "loss": 0.631, "step": 1190 }, { "epoch": 0.6098310291858678, "grad_norm": 0.46825703978538513, "learning_rate": 9.678807962022424e-05, "loss": 0.6491, "step": 1191 }, { "epoch": 0.6103430619559652, "grad_norm": 0.5804308652877808, "learning_rate": 9.677756856895532e-05, "loss": 0.6263, "step": 1192 }, { "epoch": 0.6108550947260625, "grad_norm": 0.5453972816467285, "learning_rate": 9.67670409196342e-05, "loss": 0.6301, "step": 1193 }, { "epoch": 0.6113671274961597, "grad_norm": 0.58237624168396, "learning_rate": 9.675649667599643e-05, "loss": 0.623, "step": 1194 }, { "epoch": 0.611879160266257, "grad_norm": 0.5746862292289734, "learning_rate": 9.674593584178338e-05, "loss": 0.6335, "step": 1195 }, { "epoch": 0.6123911930363544, "grad_norm": 0.5157731771469116, "learning_rate": 9.673535842074237e-05, "loss": 0.6073, "step": 1196 }, { "epoch": 0.6129032258064516, "grad_norm": 0.45225057005882263, "learning_rate": 9.672476441662657e-05, "loss": 0.644, "step": 1197 }, { "epoch": 0.6134152585765489, "grad_norm": 0.6856796145439148, "learning_rate": 9.671415383319507e-05, "loss": 0.627, "step": 1198 }, { "epoch": 0.6139272913466461, "grad_norm": 0.7683528661727905, "learning_rate": 9.670352667421276e-05, "loss": 0.6358, "step": 1199 }, { "epoch": 0.6144393241167435, "grad_norm": 0.6390611529350281, "learning_rate": 9.669288294345052e-05, "loss": 0.6557, "step": 1200 }, { "epoch": 0.6149513568868408, "grad_norm": 0.49674347043037415, "learning_rate": 9.668222264468503e-05, "loss": 0.6393, "step": 1201 }, { "epoch": 0.615463389656938, "grad_norm": 0.6459245085716248, "learning_rate": 9.667154578169888e-05, "loss": 0.6333, "step": 1202 }, { "epoch": 0.6159754224270353, "grad_norm": 0.948539674282074, "learning_rate": 9.666085235828055e-05, "loss": 0.6287, "step": 1203 }, { "epoch": 0.6164874551971327, "grad_norm": 0.9467887282371521, "learning_rate": 9.665014237822435e-05, "loss": 0.6244, "step": 1204 }, { "epoch": 0.6169994879672299, "grad_norm": 0.6527631282806396, "learning_rate": 9.663941584533052e-05, "loss": 0.6033, "step": 1205 }, { "epoch": 0.6175115207373272, "grad_norm": 0.7409166097640991, "learning_rate": 9.662867276340515e-05, "loss": 0.6307, "step": 1206 }, { "epoch": 0.6180235535074244, "grad_norm": 0.6226073503494263, "learning_rate": 9.661791313626018e-05, "loss": 0.6496, "step": 1207 }, { "epoch": 0.6185355862775218, "grad_norm": 0.6652485728263855, "learning_rate": 9.660713696771345e-05, "loss": 0.6335, "step": 1208 }, { "epoch": 0.6190476190476191, "grad_norm": 0.623690128326416, "learning_rate": 9.659634426158867e-05, "loss": 0.625, "step": 1209 }, { "epoch": 0.6195596518177163, "grad_norm": 0.6000739336013794, "learning_rate": 9.658553502171539e-05, "loss": 0.6383, "step": 1210 }, { "epoch": 0.6200716845878136, "grad_norm": 0.5823901295661926, "learning_rate": 9.657470925192907e-05, "loss": 0.6316, "step": 1211 }, { "epoch": 0.620583717357911, "grad_norm": 0.4955350160598755, "learning_rate": 9.656386695607098e-05, "loss": 0.6235, "step": 1212 }, { "epoch": 0.6210957501280082, "grad_norm": 0.5130161643028259, "learning_rate": 9.655300813798831e-05, "loss": 0.622, "step": 1213 }, { "epoch": 0.6216077828981055, "grad_norm": 0.4140961766242981, "learning_rate": 9.654213280153408e-05, "loss": 0.6312, "step": 1214 }, { "epoch": 0.6221198156682027, "grad_norm": 0.5295277833938599, "learning_rate": 9.653124095056716e-05, "loss": 0.6311, "step": 1215 }, { "epoch": 0.6226318484383001, "grad_norm": 0.596668541431427, "learning_rate": 9.652033258895233e-05, "loss": 0.6212, "step": 1216 }, { "epoch": 0.6231438812083974, "grad_norm": 0.5289850831031799, "learning_rate": 9.650940772056017e-05, "loss": 0.6064, "step": 1217 }, { "epoch": 0.6236559139784946, "grad_norm": 0.575878918170929, "learning_rate": 9.649846634926716e-05, "loss": 0.6497, "step": 1218 }, { "epoch": 0.6241679467485919, "grad_norm": 0.6117584109306335, "learning_rate": 9.64875084789556e-05, "loss": 0.6486, "step": 1219 }, { "epoch": 0.6246799795186891, "grad_norm": 0.6080948710441589, "learning_rate": 9.64765341135137e-05, "loss": 0.6048, "step": 1220 }, { "epoch": 0.6251920122887865, "grad_norm": 0.6568256616592407, "learning_rate": 9.646554325683544e-05, "loss": 0.6197, "step": 1221 }, { "epoch": 0.6257040450588838, "grad_norm": 0.8675112128257751, "learning_rate": 9.645453591282072e-05, "loss": 0.6406, "step": 1222 }, { "epoch": 0.626216077828981, "grad_norm": 0.7040305733680725, "learning_rate": 9.644351208537528e-05, "loss": 0.6294, "step": 1223 }, { "epoch": 0.6267281105990783, "grad_norm": 0.6309606432914734, "learning_rate": 9.643247177841068e-05, "loss": 0.6148, "step": 1224 }, { "epoch": 0.6272401433691757, "grad_norm": 0.7085086703300476, "learning_rate": 9.642141499584435e-05, "loss": 0.6528, "step": 1225 }, { "epoch": 0.6277521761392729, "grad_norm": 0.6404117345809937, "learning_rate": 9.641034174159956e-05, "loss": 0.6507, "step": 1226 }, { "epoch": 0.6282642089093702, "grad_norm": 0.48937246203422546, "learning_rate": 9.639925201960543e-05, "loss": 0.6184, "step": 1227 }, { "epoch": 0.6287762416794674, "grad_norm": 0.537635087966919, "learning_rate": 9.63881458337969e-05, "loss": 0.6234, "step": 1228 }, { "epoch": 0.6292882744495648, "grad_norm": 0.7003449201583862, "learning_rate": 9.63770231881148e-05, "loss": 0.6407, "step": 1229 }, { "epoch": 0.6298003072196621, "grad_norm": 0.5746563673019409, "learning_rate": 9.636588408650575e-05, "loss": 0.6125, "step": 1230 }, { "epoch": 0.6303123399897593, "grad_norm": 0.45353350043296814, "learning_rate": 9.635472853292223e-05, "loss": 0.6135, "step": 1231 }, { "epoch": 0.6308243727598566, "grad_norm": 0.5352125763893127, "learning_rate": 9.634355653132257e-05, "loss": 0.6155, "step": 1232 }, { "epoch": 0.631336405529954, "grad_norm": 0.5306152701377869, "learning_rate": 9.633236808567091e-05, "loss": 0.6357, "step": 1233 }, { "epoch": 0.6318484383000512, "grad_norm": 0.5321176052093506, "learning_rate": 9.632116319993725e-05, "loss": 0.6314, "step": 1234 }, { "epoch": 0.6323604710701485, "grad_norm": 0.5977919697761536, "learning_rate": 9.630994187809738e-05, "loss": 0.6195, "step": 1235 }, { "epoch": 0.6328725038402457, "grad_norm": 0.5664190649986267, "learning_rate": 9.629870412413301e-05, "loss": 0.6073, "step": 1236 }, { "epoch": 0.6333845366103431, "grad_norm": 0.5364221334457397, "learning_rate": 9.628744994203159e-05, "loss": 0.6385, "step": 1237 }, { "epoch": 0.6338965693804404, "grad_norm": 0.541273295879364, "learning_rate": 9.627617933578643e-05, "loss": 0.6443, "step": 1238 }, { "epoch": 0.6344086021505376, "grad_norm": 0.6627905964851379, "learning_rate": 9.626489230939669e-05, "loss": 0.6198, "step": 1239 }, { "epoch": 0.6349206349206349, "grad_norm": 0.7546166181564331, "learning_rate": 9.625358886686731e-05, "loss": 0.6237, "step": 1240 }, { "epoch": 0.6354326676907323, "grad_norm": 0.8379219770431519, "learning_rate": 9.624226901220909e-05, "loss": 0.624, "step": 1241 }, { "epoch": 0.6359447004608295, "grad_norm": 0.8538291454315186, "learning_rate": 9.623093274943869e-05, "loss": 0.6138, "step": 1242 }, { "epoch": 0.6364567332309268, "grad_norm": 0.6516668200492859, "learning_rate": 9.621958008257848e-05, "loss": 0.5985, "step": 1243 }, { "epoch": 0.636968766001024, "grad_norm": 0.690819501876831, "learning_rate": 9.620821101565678e-05, "loss": 0.6206, "step": 1244 }, { "epoch": 0.6374807987711214, "grad_norm": 0.8299631476402283, "learning_rate": 9.619682555270761e-05, "loss": 0.6381, "step": 1245 }, { "epoch": 0.6379928315412187, "grad_norm": 1.0673123598098755, "learning_rate": 9.618542369777092e-05, "loss": 0.6041, "step": 1246 }, { "epoch": 0.6385048643113159, "grad_norm": 0.9365130066871643, "learning_rate": 9.617400545489239e-05, "loss": 0.6265, "step": 1247 }, { "epoch": 0.6390168970814132, "grad_norm": 0.8756167888641357, "learning_rate": 9.616257082812357e-05, "loss": 0.6339, "step": 1248 }, { "epoch": 0.6395289298515104, "grad_norm": 0.6985016465187073, "learning_rate": 9.615111982152177e-05, "loss": 0.6034, "step": 1249 }, { "epoch": 0.6400409626216078, "grad_norm": 0.7716078758239746, "learning_rate": 9.613965243915017e-05, "loss": 0.6428, "step": 1250 }, { "epoch": 0.6405529953917051, "grad_norm": 0.8019648790359497, "learning_rate": 9.612816868507772e-05, "loss": 0.6351, "step": 1251 }, { "epoch": 0.6410650281618023, "grad_norm": 1.081592321395874, "learning_rate": 9.611666856337919e-05, "loss": 0.6261, "step": 1252 }, { "epoch": 0.6415770609318996, "grad_norm": 0.49162790179252625, "learning_rate": 9.610515207813518e-05, "loss": 0.6164, "step": 1253 }, { "epoch": 0.642089093701997, "grad_norm": 0.7312915921211243, "learning_rate": 9.609361923343206e-05, "loss": 0.6411, "step": 1254 }, { "epoch": 0.6426011264720942, "grad_norm": 0.9006524682044983, "learning_rate": 9.608207003336201e-05, "loss": 0.6269, "step": 1255 }, { "epoch": 0.6431131592421915, "grad_norm": 0.5677286386489868, "learning_rate": 9.607050448202304e-05, "loss": 0.6092, "step": 1256 }, { "epoch": 0.6436251920122887, "grad_norm": 0.5886191725730896, "learning_rate": 9.605892258351893e-05, "loss": 0.6227, "step": 1257 }, { "epoch": 0.6441372247823861, "grad_norm": 0.6648932099342346, "learning_rate": 9.604732434195932e-05, "loss": 0.6224, "step": 1258 }, { "epoch": 0.6446492575524834, "grad_norm": 0.8963104486465454, "learning_rate": 9.603570976145956e-05, "loss": 0.6175, "step": 1259 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8333325982093811, "learning_rate": 9.602407884614088e-05, "loss": 0.6257, "step": 1260 }, { "epoch": 0.6456733230926779, "grad_norm": 0.720653772354126, "learning_rate": 9.601243160013023e-05, "loss": 0.6493, "step": 1261 }, { "epoch": 0.6461853558627753, "grad_norm": 0.5905517935752869, "learning_rate": 9.600076802756042e-05, "loss": 0.6386, "step": 1262 }, { "epoch": 0.6466973886328725, "grad_norm": 0.6517970561981201, "learning_rate": 9.598908813257003e-05, "loss": 0.6376, "step": 1263 }, { "epoch": 0.6472094214029698, "grad_norm": 0.7892195582389832, "learning_rate": 9.597739191930341e-05, "loss": 0.6256, "step": 1264 }, { "epoch": 0.647721454173067, "grad_norm": 0.9383836984634399, "learning_rate": 9.596567939191075e-05, "loss": 0.6321, "step": 1265 }, { "epoch": 0.6482334869431644, "grad_norm": 1.026422142982483, "learning_rate": 9.595395055454795e-05, "loss": 0.6296, "step": 1266 }, { "epoch": 0.6487455197132617, "grad_norm": 0.6764551401138306, "learning_rate": 9.594220541137679e-05, "loss": 0.6126, "step": 1267 }, { "epoch": 0.6492575524833589, "grad_norm": 0.7543627023696899, "learning_rate": 9.593044396656479e-05, "loss": 0.6408, "step": 1268 }, { "epoch": 0.6497695852534562, "grad_norm": 0.9582692384719849, "learning_rate": 9.591866622428521e-05, "loss": 0.6282, "step": 1269 }, { "epoch": 0.6502816180235536, "grad_norm": 1.0731829404830933, "learning_rate": 9.590687218871719e-05, "loss": 0.6091, "step": 1270 }, { "epoch": 0.6507936507936508, "grad_norm": 0.7802700400352478, "learning_rate": 9.589506186404556e-05, "loss": 0.6453, "step": 1271 }, { "epoch": 0.6513056835637481, "grad_norm": 0.6512326002120972, "learning_rate": 9.5883235254461e-05, "loss": 0.6315, "step": 1272 }, { "epoch": 0.6518177163338453, "grad_norm": 0.6245826482772827, "learning_rate": 9.587139236415991e-05, "loss": 0.6312, "step": 1273 }, { "epoch": 0.6523297491039427, "grad_norm": 0.552539050579071, "learning_rate": 9.585953319734453e-05, "loss": 0.6323, "step": 1274 }, { "epoch": 0.65284178187404, "grad_norm": 0.6620020866394043, "learning_rate": 9.584765775822281e-05, "loss": 0.625, "step": 1275 }, { "epoch": 0.6533538146441372, "grad_norm": 0.6738739013671875, "learning_rate": 9.58357660510085e-05, "loss": 0.6284, "step": 1276 }, { "epoch": 0.6538658474142345, "grad_norm": 0.724885106086731, "learning_rate": 9.582385807992116e-05, "loss": 0.6139, "step": 1277 }, { "epoch": 0.6543778801843319, "grad_norm": 0.8224210739135742, "learning_rate": 9.581193384918606e-05, "loss": 0.6254, "step": 1278 }, { "epoch": 0.6548899129544291, "grad_norm": 0.5068670511245728, "learning_rate": 9.579999336303427e-05, "loss": 0.6324, "step": 1279 }, { "epoch": 0.6554019457245264, "grad_norm": 0.5373448133468628, "learning_rate": 9.578803662570262e-05, "loss": 0.6171, "step": 1280 }, { "epoch": 0.6559139784946236, "grad_norm": 0.49653705954551697, "learning_rate": 9.577606364143372e-05, "loss": 0.6203, "step": 1281 }, { "epoch": 0.6564260112647209, "grad_norm": 0.5984236001968384, "learning_rate": 9.576407441447595e-05, "loss": 0.6319, "step": 1282 }, { "epoch": 0.6569380440348183, "grad_norm": 0.6266458630561829, "learning_rate": 9.57520689490834e-05, "loss": 0.6212, "step": 1283 }, { "epoch": 0.6574500768049155, "grad_norm": 0.6676628589630127, "learning_rate": 9.5740047249516e-05, "loss": 0.6123, "step": 1284 }, { "epoch": 0.6579621095750128, "grad_norm": 1.0560606718063354, "learning_rate": 9.572800932003937e-05, "loss": 0.617, "step": 1285 }, { "epoch": 0.65847414234511, "grad_norm": 0.779434084892273, "learning_rate": 9.571595516492495e-05, "loss": 0.6436, "step": 1286 }, { "epoch": 0.6589861751152074, "grad_norm": 0.6798616647720337, "learning_rate": 9.570388478844988e-05, "loss": 0.6186, "step": 1287 }, { "epoch": 0.6594982078853047, "grad_norm": 1.021231770515442, "learning_rate": 9.56917981948971e-05, "loss": 0.6198, "step": 1288 }, { "epoch": 0.6600102406554019, "grad_norm": 0.9557245373725891, "learning_rate": 9.56796953885553e-05, "loss": 0.6271, "step": 1289 }, { "epoch": 0.6605222734254992, "grad_norm": 0.4875507950782776, "learning_rate": 9.566757637371886e-05, "loss": 0.6331, "step": 1290 }, { "epoch": 0.6610343061955966, "grad_norm": 0.9399450421333313, "learning_rate": 9.565544115468802e-05, "loss": 0.6199, "step": 1291 }, { "epoch": 0.6615463389656938, "grad_norm": 1.030307650566101, "learning_rate": 9.564328973576868e-05, "loss": 0.6307, "step": 1292 }, { "epoch": 0.6620583717357911, "grad_norm": 0.7341171503067017, "learning_rate": 9.563112212127253e-05, "loss": 0.6112, "step": 1293 }, { "epoch": 0.6625704045058883, "grad_norm": 0.7321325540542603, "learning_rate": 9.561893831551699e-05, "loss": 0.6323, "step": 1294 }, { "epoch": 0.6630824372759857, "grad_norm": 0.9088945984840393, "learning_rate": 9.560673832282523e-05, "loss": 0.6213, "step": 1295 }, { "epoch": 0.663594470046083, "grad_norm": 1.0306787490844727, "learning_rate": 9.559452214752618e-05, "loss": 0.6293, "step": 1296 }, { "epoch": 0.6641065028161802, "grad_norm": 0.9592380523681641, "learning_rate": 9.558228979395447e-05, "loss": 0.6421, "step": 1297 }, { "epoch": 0.6646185355862775, "grad_norm": 0.9627156853675842, "learning_rate": 9.557004126645052e-05, "loss": 0.6048, "step": 1298 }, { "epoch": 0.6651305683563749, "grad_norm": 0.7647123336791992, "learning_rate": 9.555777656936047e-05, "loss": 0.6269, "step": 1299 }, { "epoch": 0.6656426011264721, "grad_norm": 0.7965298295021057, "learning_rate": 9.554549570703616e-05, "loss": 0.6012, "step": 1300 }, { "epoch": 0.6661546338965694, "grad_norm": 0.7146574258804321, "learning_rate": 9.553319868383526e-05, "loss": 0.6295, "step": 1301 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6624115705490112, "learning_rate": 9.552088550412106e-05, "loss": 0.591, "step": 1302 }, { "epoch": 0.667178699436764, "grad_norm": 0.6493610739707947, "learning_rate": 9.550855617226264e-05, "loss": 0.6147, "step": 1303 }, { "epoch": 0.6676907322068613, "grad_norm": 1.0314725637435913, "learning_rate": 9.549621069263483e-05, "loss": 0.6264, "step": 1304 }, { "epoch": 0.6682027649769585, "grad_norm": 0.797880232334137, "learning_rate": 9.548384906961818e-05, "loss": 0.632, "step": 1305 }, { "epoch": 0.6687147977470558, "grad_norm": 0.6843107342720032, "learning_rate": 9.547147130759893e-05, "loss": 0.628, "step": 1306 }, { "epoch": 0.6692268305171531, "grad_norm": 0.8282567858695984, "learning_rate": 9.545907741096907e-05, "loss": 0.6056, "step": 1307 }, { "epoch": 0.6697388632872504, "grad_norm": 0.8661956787109375, "learning_rate": 9.544666738412634e-05, "loss": 0.6156, "step": 1308 }, { "epoch": 0.6702508960573477, "grad_norm": 0.6663525700569153, "learning_rate": 9.543424123147415e-05, "loss": 0.6028, "step": 1309 }, { "epoch": 0.6707629288274449, "grad_norm": 0.5399653315544128, "learning_rate": 9.54217989574217e-05, "loss": 0.6009, "step": 1310 }, { "epoch": 0.6712749615975423, "grad_norm": 0.6751247644424438, "learning_rate": 9.540934056638384e-05, "loss": 0.6032, "step": 1311 }, { "epoch": 0.6717869943676396, "grad_norm": 0.5763823390007019, "learning_rate": 9.53968660627812e-05, "loss": 0.6393, "step": 1312 }, { "epoch": 0.6722990271377368, "grad_norm": 0.5795028209686279, "learning_rate": 9.538437545104007e-05, "loss": 0.6368, "step": 1313 }, { "epoch": 0.6728110599078341, "grad_norm": 0.9029579758644104, "learning_rate": 9.537186873559252e-05, "loss": 0.6174, "step": 1314 }, { "epoch": 0.6733230926779313, "grad_norm": 0.8554055690765381, "learning_rate": 9.535934592087627e-05, "loss": 0.6212, "step": 1315 }, { "epoch": 0.6738351254480287, "grad_norm": 0.5057054162025452, "learning_rate": 9.534680701133481e-05, "loss": 0.6067, "step": 1316 }, { "epoch": 0.674347158218126, "grad_norm": 0.7220867276191711, "learning_rate": 9.533425201141727e-05, "loss": 0.6208, "step": 1317 }, { "epoch": 0.6748591909882232, "grad_norm": 0.9957917928695679, "learning_rate": 9.532168092557858e-05, "loss": 0.6498, "step": 1318 }, { "epoch": 0.6753712237583205, "grad_norm": 0.8912699818611145, "learning_rate": 9.53090937582793e-05, "loss": 0.6001, "step": 1319 }, { "epoch": 0.6758832565284179, "grad_norm": 0.7167593240737915, "learning_rate": 9.529649051398573e-05, "loss": 0.6347, "step": 1320 }, { "epoch": 0.6763952892985151, "grad_norm": 0.7390733361244202, "learning_rate": 9.528387119716988e-05, "loss": 0.665, "step": 1321 }, { "epoch": 0.6769073220686124, "grad_norm": 1.1766300201416016, "learning_rate": 9.527123581230943e-05, "loss": 0.6471, "step": 1322 }, { "epoch": 0.6774193548387096, "grad_norm": 1.0111973285675049, "learning_rate": 9.525858436388782e-05, "loss": 0.6066, "step": 1323 }, { "epoch": 0.677931387608807, "grad_norm": 0.7271942496299744, "learning_rate": 9.524591685639414e-05, "loss": 0.6222, "step": 1324 }, { "epoch": 0.6784434203789043, "grad_norm": 0.9120113253593445, "learning_rate": 9.523323329432318e-05, "loss": 0.6432, "step": 1325 }, { "epoch": 0.6789554531490015, "grad_norm": 1.0131735801696777, "learning_rate": 9.522053368217545e-05, "loss": 0.6305, "step": 1326 }, { "epoch": 0.6794674859190988, "grad_norm": 0.6753901839256287, "learning_rate": 9.520781802445714e-05, "loss": 0.6235, "step": 1327 }, { "epoch": 0.6799795186891961, "grad_norm": 0.8494089245796204, "learning_rate": 9.519508632568015e-05, "loss": 0.5987, "step": 1328 }, { "epoch": 0.6804915514592934, "grad_norm": 1.0257995128631592, "learning_rate": 9.518233859036204e-05, "loss": 0.6329, "step": 1329 }, { "epoch": 0.6810035842293907, "grad_norm": 0.8629685640335083, "learning_rate": 9.51695748230261e-05, "loss": 0.5962, "step": 1330 }, { "epoch": 0.6815156169994879, "grad_norm": 0.6273880004882812, "learning_rate": 9.515679502820127e-05, "loss": 0.6333, "step": 1331 }, { "epoch": 0.6820276497695853, "grad_norm": 1.0831217765808105, "learning_rate": 9.51439992104222e-05, "loss": 0.6341, "step": 1332 }, { "epoch": 0.6825396825396826, "grad_norm": 1.0425894260406494, "learning_rate": 9.513118737422926e-05, "loss": 0.6191, "step": 1333 }, { "epoch": 0.6830517153097798, "grad_norm": 0.5631024241447449, "learning_rate": 9.51183595241684e-05, "loss": 0.6147, "step": 1334 }, { "epoch": 0.6835637480798771, "grad_norm": 0.8879140615463257, "learning_rate": 9.510551566479136e-05, "loss": 0.6212, "step": 1335 }, { "epoch": 0.6840757808499744, "grad_norm": 0.9369656443595886, "learning_rate": 9.509265580065551e-05, "loss": 0.5999, "step": 1336 }, { "epoch": 0.6845878136200717, "grad_norm": 0.9999755024909973, "learning_rate": 9.50797799363239e-05, "loss": 0.6019, "step": 1337 }, { "epoch": 0.685099846390169, "grad_norm": 0.7701225876808167, "learning_rate": 9.506688807636527e-05, "loss": 0.5827, "step": 1338 }, { "epoch": 0.6856118791602662, "grad_norm": 0.6666101813316345, "learning_rate": 9.505398022535402e-05, "loss": 0.6265, "step": 1339 }, { "epoch": 0.6861239119303636, "grad_norm": 0.830475926399231, "learning_rate": 9.504105638787024e-05, "loss": 0.6217, "step": 1340 }, { "epoch": 0.6866359447004609, "grad_norm": 0.7647290825843811, "learning_rate": 9.50281165684997e-05, "loss": 0.6442, "step": 1341 }, { "epoch": 0.6871479774705581, "grad_norm": 0.8804572820663452, "learning_rate": 9.50151607718338e-05, "loss": 0.6038, "step": 1342 }, { "epoch": 0.6876600102406554, "grad_norm": 0.6506420969963074, "learning_rate": 9.500218900246966e-05, "loss": 0.6286, "step": 1343 }, { "epoch": 0.6881720430107527, "grad_norm": 1.0107383728027344, "learning_rate": 9.498920126501002e-05, "loss": 0.6416, "step": 1344 }, { "epoch": 0.68868407578085, "grad_norm": 1.3155896663665771, "learning_rate": 9.497619756406336e-05, "loss": 0.6632, "step": 1345 }, { "epoch": 0.6891961085509473, "grad_norm": 0.8019019961357117, "learning_rate": 9.49631779042437e-05, "loss": 0.6498, "step": 1346 }, { "epoch": 0.6897081413210445, "grad_norm": 0.7264910936355591, "learning_rate": 9.495014229017085e-05, "loss": 0.6221, "step": 1347 }, { "epoch": 0.6902201740911418, "grad_norm": 0.8733853101730347, "learning_rate": 9.493709072647022e-05, "loss": 0.6108, "step": 1348 }, { "epoch": 0.6907322068612392, "grad_norm": 0.8011679649353027, "learning_rate": 9.492402321777287e-05, "loss": 0.6155, "step": 1349 }, { "epoch": 0.6912442396313364, "grad_norm": 0.7884248495101929, "learning_rate": 9.491093976871555e-05, "loss": 0.6049, "step": 1350 }, { "epoch": 0.6917562724014337, "grad_norm": 0.7755663990974426, "learning_rate": 9.489784038394066e-05, "loss": 0.6203, "step": 1351 }, { "epoch": 0.6922683051715309, "grad_norm": 0.8086382150650024, "learning_rate": 9.48847250680962e-05, "loss": 0.6453, "step": 1352 }, { "epoch": 0.6927803379416283, "grad_norm": 0.6675652265548706, "learning_rate": 9.487159382583592e-05, "loss": 0.604, "step": 1353 }, { "epoch": 0.6932923707117256, "grad_norm": 0.6121629476547241, "learning_rate": 9.485844666181915e-05, "loss": 0.6482, "step": 1354 }, { "epoch": 0.6938044034818228, "grad_norm": 0.8224285840988159, "learning_rate": 9.484528358071089e-05, "loss": 0.6171, "step": 1355 }, { "epoch": 0.6943164362519201, "grad_norm": 0.48487037420272827, "learning_rate": 9.483210458718179e-05, "loss": 0.6042, "step": 1356 }, { "epoch": 0.6948284690220174, "grad_norm": 0.5998106598854065, "learning_rate": 9.481890968590813e-05, "loss": 0.6224, "step": 1357 }, { "epoch": 0.6953405017921147, "grad_norm": 0.665101945400238, "learning_rate": 9.480569888157187e-05, "loss": 0.6143, "step": 1358 }, { "epoch": 0.695852534562212, "grad_norm": 0.6115569472312927, "learning_rate": 9.479247217886056e-05, "loss": 0.637, "step": 1359 }, { "epoch": 0.6963645673323092, "grad_norm": 0.49729758501052856, "learning_rate": 9.477922958246745e-05, "loss": 0.6233, "step": 1360 }, { "epoch": 0.6968766001024066, "grad_norm": 0.5644206404685974, "learning_rate": 9.47659710970914e-05, "loss": 0.605, "step": 1361 }, { "epoch": 0.6973886328725039, "grad_norm": 0.5353266000747681, "learning_rate": 9.475269672743687e-05, "loss": 0.638, "step": 1362 }, { "epoch": 0.6979006656426011, "grad_norm": 0.6346437335014343, "learning_rate": 9.473940647821406e-05, "loss": 0.6123, "step": 1363 }, { "epoch": 0.6984126984126984, "grad_norm": 0.4797613024711609, "learning_rate": 9.47261003541387e-05, "loss": 0.6174, "step": 1364 }, { "epoch": 0.6989247311827957, "grad_norm": 0.512795090675354, "learning_rate": 9.471277835993217e-05, "loss": 0.6347, "step": 1365 }, { "epoch": 0.699436763952893, "grad_norm": 0.6752040982246399, "learning_rate": 9.469944050032156e-05, "loss": 0.6417, "step": 1366 }, { "epoch": 0.6999487967229903, "grad_norm": 0.807709276676178, "learning_rate": 9.46860867800395e-05, "loss": 0.6072, "step": 1367 }, { "epoch": 0.7004608294930875, "grad_norm": 0.6822542548179626, "learning_rate": 9.467271720382427e-05, "loss": 0.6108, "step": 1368 }, { "epoch": 0.7009728622631849, "grad_norm": 0.5755512714385986, "learning_rate": 9.465933177641982e-05, "loss": 0.6221, "step": 1369 }, { "epoch": 0.7014848950332822, "grad_norm": 0.6674950122833252, "learning_rate": 9.464593050257568e-05, "loss": 0.6236, "step": 1370 }, { "epoch": 0.7019969278033794, "grad_norm": 0.8138725757598877, "learning_rate": 9.4632513387047e-05, "loss": 0.6269, "step": 1371 }, { "epoch": 0.7025089605734767, "grad_norm": 0.9213794469833374, "learning_rate": 9.461908043459458e-05, "loss": 0.6151, "step": 1372 }, { "epoch": 0.703020993343574, "grad_norm": 0.6001470685005188, "learning_rate": 9.460563164998482e-05, "loss": 0.6279, "step": 1373 }, { "epoch": 0.7035330261136713, "grad_norm": 0.7958489060401917, "learning_rate": 9.459216703798974e-05, "loss": 0.6195, "step": 1374 }, { "epoch": 0.7040450588837686, "grad_norm": 0.8188454508781433, "learning_rate": 9.457868660338701e-05, "loss": 0.6196, "step": 1375 }, { "epoch": 0.7045570916538658, "grad_norm": 0.7665884494781494, "learning_rate": 9.456519035095982e-05, "loss": 0.6055, "step": 1376 }, { "epoch": 0.7050691244239631, "grad_norm": 0.8974027037620544, "learning_rate": 9.455167828549709e-05, "loss": 0.6147, "step": 1377 }, { "epoch": 0.7055811571940604, "grad_norm": 0.9711163640022278, "learning_rate": 9.453815041179329e-05, "loss": 0.6312, "step": 1378 }, { "epoch": 0.7060931899641577, "grad_norm": 0.8727755546569824, "learning_rate": 9.452460673464846e-05, "loss": 0.6182, "step": 1379 }, { "epoch": 0.706605222734255, "grad_norm": 0.7593328952789307, "learning_rate": 9.451104725886837e-05, "loss": 0.6104, "step": 1380 }, { "epoch": 0.7071172555043522, "grad_norm": 0.630299985408783, "learning_rate": 9.449747198926426e-05, "loss": 0.6146, "step": 1381 }, { "epoch": 0.7076292882744496, "grad_norm": 0.8935937285423279, "learning_rate": 9.448388093065306e-05, "loss": 0.6521, "step": 1382 }, { "epoch": 0.7081413210445469, "grad_norm": 0.8064360022544861, "learning_rate": 9.447027408785725e-05, "loss": 0.6275, "step": 1383 }, { "epoch": 0.7086533538146441, "grad_norm": 0.46093374490737915, "learning_rate": 9.445665146570497e-05, "loss": 0.6266, "step": 1384 }, { "epoch": 0.7091653865847414, "grad_norm": 0.7555334568023682, "learning_rate": 9.44430130690299e-05, "loss": 0.6189, "step": 1385 }, { "epoch": 0.7096774193548387, "grad_norm": 0.862582266330719, "learning_rate": 9.442935890267136e-05, "loss": 0.5996, "step": 1386 }, { "epoch": 0.710189452124936, "grad_norm": 0.6706003546714783, "learning_rate": 9.441568897147424e-05, "loss": 0.642, "step": 1387 }, { "epoch": 0.7107014848950333, "grad_norm": 0.6213183999061584, "learning_rate": 9.440200328028903e-05, "loss": 0.6096, "step": 1388 }, { "epoch": 0.7112135176651305, "grad_norm": 0.7055994272232056, "learning_rate": 9.438830183397183e-05, "loss": 0.5915, "step": 1389 }, { "epoch": 0.7117255504352279, "grad_norm": 0.7422837018966675, "learning_rate": 9.43745846373843e-05, "loss": 0.6222, "step": 1390 }, { "epoch": 0.7122375832053252, "grad_norm": 0.5505529046058655, "learning_rate": 9.436085169539368e-05, "loss": 0.6229, "step": 1391 }, { "epoch": 0.7127496159754224, "grad_norm": 0.4774070382118225, "learning_rate": 9.434710301287287e-05, "loss": 0.6214, "step": 1392 }, { "epoch": 0.7132616487455197, "grad_norm": 0.5182769298553467, "learning_rate": 9.433333859470028e-05, "loss": 0.6217, "step": 1393 }, { "epoch": 0.713773681515617, "grad_norm": 0.5018037557601929, "learning_rate": 9.431955844575992e-05, "loss": 0.5899, "step": 1394 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5400078296661377, "learning_rate": 9.430576257094141e-05, "loss": 0.6299, "step": 1395 }, { "epoch": 0.7147977470558116, "grad_norm": 0.6775665283203125, "learning_rate": 9.429195097513993e-05, "loss": 0.6313, "step": 1396 }, { "epoch": 0.7153097798259088, "grad_norm": 0.7176981568336487, "learning_rate": 9.427812366325621e-05, "loss": 0.6229, "step": 1397 }, { "epoch": 0.7158218125960062, "grad_norm": 0.7125338315963745, "learning_rate": 9.426428064019662e-05, "loss": 0.6099, "step": 1398 }, { "epoch": 0.7163338453661034, "grad_norm": 0.6778757572174072, "learning_rate": 9.425042191087306e-05, "loss": 0.601, "step": 1399 }, { "epoch": 0.7168458781362007, "grad_norm": 0.737187385559082, "learning_rate": 9.4236547480203e-05, "loss": 0.6281, "step": 1400 }, { "epoch": 0.717357910906298, "grad_norm": 0.7879711389541626, "learning_rate": 9.42226573531095e-05, "loss": 0.6215, "step": 1401 }, { "epoch": 0.7178699436763953, "grad_norm": 0.5530551671981812, "learning_rate": 9.42087515345212e-05, "loss": 0.6129, "step": 1402 }, { "epoch": 0.7183819764464926, "grad_norm": 0.5459476709365845, "learning_rate": 9.419483002937229e-05, "loss": 0.6134, "step": 1403 }, { "epoch": 0.7188940092165899, "grad_norm": 0.6212730407714844, "learning_rate": 9.41808928426025e-05, "loss": 0.6508, "step": 1404 }, { "epoch": 0.7194060419866871, "grad_norm": 0.6483210325241089, "learning_rate": 9.416693997915718e-05, "loss": 0.6262, "step": 1405 }, { "epoch": 0.7199180747567845, "grad_norm": 0.5651390552520752, "learning_rate": 9.41529714439872e-05, "loss": 0.615, "step": 1406 }, { "epoch": 0.7204301075268817, "grad_norm": 0.5541356801986694, "learning_rate": 9.4138987242049e-05, "loss": 0.6175, "step": 1407 }, { "epoch": 0.720942140296979, "grad_norm": 0.5940544009208679, "learning_rate": 9.412498737830461e-05, "loss": 0.627, "step": 1408 }, { "epoch": 0.7214541730670763, "grad_norm": 0.5895199775695801, "learning_rate": 9.411097185772157e-05, "loss": 0.6128, "step": 1409 }, { "epoch": 0.7219662058371735, "grad_norm": 0.599393367767334, "learning_rate": 9.409694068527302e-05, "loss": 0.6272, "step": 1410 }, { "epoch": 0.7224782386072709, "grad_norm": 0.5696869492530823, "learning_rate": 9.40828938659376e-05, "loss": 0.6279, "step": 1411 }, { "epoch": 0.7229902713773682, "grad_norm": 0.852377712726593, "learning_rate": 9.406883140469957e-05, "loss": 0.6074, "step": 1412 }, { "epoch": 0.7235023041474654, "grad_norm": 0.8876937031745911, "learning_rate": 9.405475330654867e-05, "loss": 0.6162, "step": 1413 }, { "epoch": 0.7240143369175627, "grad_norm": 0.8141427040100098, "learning_rate": 9.404065957648023e-05, "loss": 0.6328, "step": 1414 }, { "epoch": 0.72452636968766, "grad_norm": 0.7472403049468994, "learning_rate": 9.402655021949514e-05, "loss": 0.6143, "step": 1415 }, { "epoch": 0.7250384024577573, "grad_norm": 0.7278257608413696, "learning_rate": 9.401242524059978e-05, "loss": 0.6302, "step": 1416 }, { "epoch": 0.7255504352278546, "grad_norm": 0.686924397945404, "learning_rate": 9.399828464480613e-05, "loss": 0.6352, "step": 1417 }, { "epoch": 0.7260624679979518, "grad_norm": 0.803595244884491, "learning_rate": 9.398412843713167e-05, "loss": 0.6147, "step": 1418 }, { "epoch": 0.7265745007680492, "grad_norm": 0.8225151896476746, "learning_rate": 9.396995662259946e-05, "loss": 0.6107, "step": 1419 }, { "epoch": 0.7270865335381465, "grad_norm": 0.7580759525299072, "learning_rate": 9.395576920623807e-05, "loss": 0.6313, "step": 1420 }, { "epoch": 0.7275985663082437, "grad_norm": 1.0862232446670532, "learning_rate": 9.394156619308158e-05, "loss": 0.6142, "step": 1421 }, { "epoch": 0.728110599078341, "grad_norm": 0.8804206252098083, "learning_rate": 9.392734758816968e-05, "loss": 0.6179, "step": 1422 }, { "epoch": 0.7286226318484383, "grad_norm": 0.7671944499015808, "learning_rate": 9.391311339654753e-05, "loss": 0.6124, "step": 1423 }, { "epoch": 0.7291346646185356, "grad_norm": 1.200302004814148, "learning_rate": 9.389886362326582e-05, "loss": 0.6163, "step": 1424 }, { "epoch": 0.7296466973886329, "grad_norm": 0.8192538619041443, "learning_rate": 9.388459827338083e-05, "loss": 0.6243, "step": 1425 }, { "epoch": 0.7301587301587301, "grad_norm": 0.813095211982727, "learning_rate": 9.387031735195428e-05, "loss": 0.6261, "step": 1426 }, { "epoch": 0.7306707629288275, "grad_norm": 0.9510880708694458, "learning_rate": 9.385602086405348e-05, "loss": 0.6197, "step": 1427 }, { "epoch": 0.7311827956989247, "grad_norm": 0.9546497464179993, "learning_rate": 9.384170881475122e-05, "loss": 0.6372, "step": 1428 }, { "epoch": 0.731694828469022, "grad_norm": 0.8568170666694641, "learning_rate": 9.382738120912587e-05, "loss": 0.6219, "step": 1429 }, { "epoch": 0.7322068612391193, "grad_norm": 0.6787075400352478, "learning_rate": 9.381303805226127e-05, "loss": 0.5995, "step": 1430 }, { "epoch": 0.7327188940092166, "grad_norm": 0.67671799659729, "learning_rate": 9.379867934924679e-05, "loss": 0.6362, "step": 1431 }, { "epoch": 0.7332309267793139, "grad_norm": 0.7477825880050659, "learning_rate": 9.378430510517731e-05, "loss": 0.6116, "step": 1432 }, { "epoch": 0.7337429595494112, "grad_norm": 0.47436821460723877, "learning_rate": 9.376991532515324e-05, "loss": 0.6119, "step": 1433 }, { "epoch": 0.7342549923195084, "grad_norm": 0.6497752666473389, "learning_rate": 9.375551001428052e-05, "loss": 0.6067, "step": 1434 }, { "epoch": 0.7347670250896058, "grad_norm": 0.7237414717674255, "learning_rate": 9.374108917767055e-05, "loss": 0.6557, "step": 1435 }, { "epoch": 0.735279057859703, "grad_norm": 0.6366404294967651, "learning_rate": 9.372665282044025e-05, "loss": 0.6333, "step": 1436 }, { "epoch": 0.7357910906298003, "grad_norm": 0.7928580045700073, "learning_rate": 9.37122009477121e-05, "loss": 0.6029, "step": 1437 }, { "epoch": 0.7363031233998976, "grad_norm": 0.6368914246559143, "learning_rate": 9.369773356461402e-05, "loss": 0.6159, "step": 1438 }, { "epoch": 0.7368151561699949, "grad_norm": 0.755048930644989, "learning_rate": 9.368325067627948e-05, "loss": 0.6325, "step": 1439 }, { "epoch": 0.7373271889400922, "grad_norm": 0.8012481927871704, "learning_rate": 9.366875228784743e-05, "loss": 0.6169, "step": 1440 }, { "epoch": 0.7378392217101895, "grad_norm": 0.7609354853630066, "learning_rate": 9.36542384044623e-05, "loss": 0.6239, "step": 1441 }, { "epoch": 0.7383512544802867, "grad_norm": 1.0283498764038086, "learning_rate": 9.363970903127408e-05, "loss": 0.6316, "step": 1442 }, { "epoch": 0.738863287250384, "grad_norm": 0.7834647297859192, "learning_rate": 9.362516417343819e-05, "loss": 0.6168, "step": 1443 }, { "epoch": 0.7393753200204813, "grad_norm": 0.9822630882263184, "learning_rate": 9.361060383611557e-05, "loss": 0.617, "step": 1444 }, { "epoch": 0.7398873527905786, "grad_norm": 0.7609227895736694, "learning_rate": 9.359602802447267e-05, "loss": 0.6514, "step": 1445 }, { "epoch": 0.7403993855606759, "grad_norm": 0.6444324254989624, "learning_rate": 9.35814367436814e-05, "loss": 0.6005, "step": 1446 }, { "epoch": 0.7409114183307731, "grad_norm": 0.9155465364456177, "learning_rate": 9.35668299989192e-05, "loss": 0.6278, "step": 1447 }, { "epoch": 0.7414234511008705, "grad_norm": 0.960804283618927, "learning_rate": 9.355220779536894e-05, "loss": 0.6002, "step": 1448 }, { "epoch": 0.7419354838709677, "grad_norm": 0.7269085049629211, "learning_rate": 9.353757013821902e-05, "loss": 0.6455, "step": 1449 }, { "epoch": 0.742447516641065, "grad_norm": 0.718574583530426, "learning_rate": 9.352291703266331e-05, "loss": 0.62, "step": 1450 }, { "epoch": 0.7429595494111623, "grad_norm": 0.7170273661613464, "learning_rate": 9.350824848390117e-05, "loss": 0.6162, "step": 1451 }, { "epoch": 0.7434715821812596, "grad_norm": 0.7386274933815002, "learning_rate": 9.349356449713741e-05, "loss": 0.6107, "step": 1452 }, { "epoch": 0.7439836149513569, "grad_norm": 0.6198958158493042, "learning_rate": 9.347886507758235e-05, "loss": 0.6203, "step": 1453 }, { "epoch": 0.7444956477214542, "grad_norm": 0.6816853880882263, "learning_rate": 9.346415023045179e-05, "loss": 0.6069, "step": 1454 }, { "epoch": 0.7450076804915514, "grad_norm": 0.59698086977005, "learning_rate": 9.344941996096697e-05, "loss": 0.6163, "step": 1455 }, { "epoch": 0.7455197132616488, "grad_norm": 0.6464901566505432, "learning_rate": 9.343467427435461e-05, "loss": 0.6126, "step": 1456 }, { "epoch": 0.746031746031746, "grad_norm": 0.5961935520172119, "learning_rate": 9.341991317584695e-05, "loss": 0.6096, "step": 1457 }, { "epoch": 0.7465437788018433, "grad_norm": 0.6468561887741089, "learning_rate": 9.340513667068161e-05, "loss": 0.6448, "step": 1458 }, { "epoch": 0.7470558115719406, "grad_norm": 0.7821601629257202, "learning_rate": 9.339034476410177e-05, "loss": 0.6116, "step": 1459 }, { "epoch": 0.7475678443420379, "grad_norm": 0.8335040807723999, "learning_rate": 9.337553746135604e-05, "loss": 0.6155, "step": 1460 }, { "epoch": 0.7480798771121352, "grad_norm": 0.5850638151168823, "learning_rate": 9.336071476769843e-05, "loss": 0.6139, "step": 1461 }, { "epoch": 0.7485919098822325, "grad_norm": 0.604335367679596, "learning_rate": 9.33458766883885e-05, "loss": 0.6257, "step": 1462 }, { "epoch": 0.7491039426523297, "grad_norm": 0.6160181760787964, "learning_rate": 9.333102322869123e-05, "loss": 0.6165, "step": 1463 }, { "epoch": 0.7496159754224271, "grad_norm": 0.5766586661338806, "learning_rate": 9.331615439387709e-05, "loss": 0.6202, "step": 1464 }, { "epoch": 0.7501280081925243, "grad_norm": 0.5597929954528809, "learning_rate": 9.330127018922194e-05, "loss": 0.6227, "step": 1465 }, { "epoch": 0.7506400409626216, "grad_norm": 0.6532039046287537, "learning_rate": 9.328637062000715e-05, "loss": 0.6019, "step": 1466 }, { "epoch": 0.7511520737327189, "grad_norm": 0.4501531720161438, "learning_rate": 9.327145569151952e-05, "loss": 0.6003, "step": 1467 }, { "epoch": 0.7516641065028162, "grad_norm": 0.6128273606300354, "learning_rate": 9.32565254090513e-05, "loss": 0.5931, "step": 1468 }, { "epoch": 0.7521761392729135, "grad_norm": 0.7237525582313538, "learning_rate": 9.324157977790018e-05, "loss": 0.6433, "step": 1469 }, { "epoch": 0.7526881720430108, "grad_norm": 0.5612293481826782, "learning_rate": 9.322661880336935e-05, "loss": 0.6239, "step": 1470 }, { "epoch": 0.753200204813108, "grad_norm": 0.5488108396530151, "learning_rate": 9.321164249076737e-05, "loss": 0.6198, "step": 1471 }, { "epoch": 0.7537122375832054, "grad_norm": 0.5466194152832031, "learning_rate": 9.319665084540827e-05, "loss": 0.5922, "step": 1472 }, { "epoch": 0.7542242703533026, "grad_norm": 0.4916110634803772, "learning_rate": 9.318164387261155e-05, "loss": 0.6195, "step": 1473 }, { "epoch": 0.7547363031233999, "grad_norm": 0.6376786231994629, "learning_rate": 9.316662157770208e-05, "loss": 0.6203, "step": 1474 }, { "epoch": 0.7552483358934972, "grad_norm": 0.7546161413192749, "learning_rate": 9.315158396601028e-05, "loss": 0.6, "step": 1475 }, { "epoch": 0.7557603686635944, "grad_norm": 0.7149611115455627, "learning_rate": 9.313653104287186e-05, "loss": 0.6232, "step": 1476 }, { "epoch": 0.7562724014336918, "grad_norm": 0.5330854654312134, "learning_rate": 9.31214628136281e-05, "loss": 0.6151, "step": 1477 }, { "epoch": 0.756784434203789, "grad_norm": 0.5114818811416626, "learning_rate": 9.31063792836256e-05, "loss": 0.6179, "step": 1478 }, { "epoch": 0.7572964669738863, "grad_norm": 0.589394211769104, "learning_rate": 9.309128045821647e-05, "loss": 0.6076, "step": 1479 }, { "epoch": 0.7578084997439836, "grad_norm": 0.45736587047576904, "learning_rate": 9.307616634275821e-05, "loss": 0.6203, "step": 1480 }, { "epoch": 0.7583205325140809, "grad_norm": 0.5717702507972717, "learning_rate": 9.306103694261376e-05, "loss": 0.6285, "step": 1481 }, { "epoch": 0.7588325652841782, "grad_norm": 0.8159053921699524, "learning_rate": 9.304589226315144e-05, "loss": 0.5927, "step": 1482 }, { "epoch": 0.7593445980542755, "grad_norm": 0.6956404447555542, "learning_rate": 9.303073230974507e-05, "loss": 0.6082, "step": 1483 }, { "epoch": 0.7598566308243727, "grad_norm": 0.6816420555114746, "learning_rate": 9.301555708777381e-05, "loss": 0.5911, "step": 1484 }, { "epoch": 0.7603686635944701, "grad_norm": 0.8344772458076477, "learning_rate": 9.300036660262228e-05, "loss": 0.5758, "step": 1485 }, { "epoch": 0.7608806963645673, "grad_norm": 0.8542769551277161, "learning_rate": 9.298516085968052e-05, "loss": 0.6002, "step": 1486 }, { "epoch": 0.7613927291346646, "grad_norm": 0.6624354720115662, "learning_rate": 9.2969939864344e-05, "loss": 0.6247, "step": 1487 }, { "epoch": 0.7619047619047619, "grad_norm": 0.5258535742759705, "learning_rate": 9.295470362201351e-05, "loss": 0.6123, "step": 1488 }, { "epoch": 0.7624167946748592, "grad_norm": 0.5287208557128906, "learning_rate": 9.293945213809537e-05, "loss": 0.6149, "step": 1489 }, { "epoch": 0.7629288274449565, "grad_norm": 0.5423457026481628, "learning_rate": 9.292418541800122e-05, "loss": 0.632, "step": 1490 }, { "epoch": 0.7634408602150538, "grad_norm": 0.6506828665733337, "learning_rate": 9.290890346714815e-05, "loss": 0.6275, "step": 1491 }, { "epoch": 0.763952892985151, "grad_norm": 0.900181770324707, "learning_rate": 9.289360629095867e-05, "loss": 0.6007, "step": 1492 }, { "epoch": 0.7644649257552484, "grad_norm": 1.049805998802185, "learning_rate": 9.287829389486064e-05, "loss": 0.5949, "step": 1493 }, { "epoch": 0.7649769585253456, "grad_norm": 0.7891620993614197, "learning_rate": 9.286296628428735e-05, "loss": 0.5983, "step": 1494 }, { "epoch": 0.7654889912954429, "grad_norm": 0.7216867208480835, "learning_rate": 9.284762346467748e-05, "loss": 0.6119, "step": 1495 }, { "epoch": 0.7660010240655402, "grad_norm": 1.128071665763855, "learning_rate": 9.283226544147512e-05, "loss": 0.6108, "step": 1496 }, { "epoch": 0.7665130568356375, "grad_norm": 0.83089280128479, "learning_rate": 9.281689222012974e-05, "loss": 0.5886, "step": 1497 }, { "epoch": 0.7670250896057348, "grad_norm": 0.9412809610366821, "learning_rate": 9.280150380609623e-05, "loss": 0.6304, "step": 1498 }, { "epoch": 0.767537122375832, "grad_norm": 1.076553463935852, "learning_rate": 9.278610020483483e-05, "loss": 0.6332, "step": 1499 }, { "epoch": 0.7680491551459293, "grad_norm": 0.747258722782135, "learning_rate": 9.277068142181118e-05, "loss": 0.6062, "step": 1500 }, { "epoch": 0.7685611879160267, "grad_norm": 0.9373590350151062, "learning_rate": 9.275524746249632e-05, "loss": 0.6076, "step": 1501 }, { "epoch": 0.7690732206861239, "grad_norm": 1.5187216997146606, "learning_rate": 9.273979833236672e-05, "loss": 0.6333, "step": 1502 }, { "epoch": 0.7695852534562212, "grad_norm": 0.6018044352531433, "learning_rate": 9.272433403690412e-05, "loss": 0.6062, "step": 1503 }, { "epoch": 0.7700972862263185, "grad_norm": 1.4789425134658813, "learning_rate": 9.270885458159575e-05, "loss": 0.6209, "step": 1504 }, { "epoch": 0.7706093189964157, "grad_norm": 0.6630963087081909, "learning_rate": 9.269335997193414e-05, "loss": 0.6138, "step": 1505 }, { "epoch": 0.7711213517665131, "grad_norm": 1.7096540927886963, "learning_rate": 9.267785021341725e-05, "loss": 0.6284, "step": 1506 }, { "epoch": 0.7716333845366103, "grad_norm": 0.8251217603683472, "learning_rate": 9.26623253115484e-05, "loss": 0.616, "step": 1507 }, { "epoch": 0.7721454173067076, "grad_norm": 1.4139816761016846, "learning_rate": 9.264678527183625e-05, "loss": 0.6323, "step": 1508 }, { "epoch": 0.7726574500768049, "grad_norm": 0.8918523192405701, "learning_rate": 9.26312300997949e-05, "loss": 0.6312, "step": 1509 }, { "epoch": 0.7731694828469022, "grad_norm": 1.3293424844741821, "learning_rate": 9.261565980094377e-05, "loss": 0.6162, "step": 1510 }, { "epoch": 0.7736815156169995, "grad_norm": 1.074052333831787, "learning_rate": 9.260007438080767e-05, "loss": 0.6251, "step": 1511 }, { "epoch": 0.7741935483870968, "grad_norm": 1.177394986152649, "learning_rate": 9.258447384491673e-05, "loss": 0.6166, "step": 1512 }, { "epoch": 0.774705581157194, "grad_norm": 1.0398776531219482, "learning_rate": 9.256885819880649e-05, "loss": 0.6185, "step": 1513 }, { "epoch": 0.7752176139272914, "grad_norm": 1.061400294303894, "learning_rate": 9.255322744801786e-05, "loss": 0.5982, "step": 1514 }, { "epoch": 0.7757296466973886, "grad_norm": 1.0271527767181396, "learning_rate": 9.253758159809707e-05, "loss": 0.6269, "step": 1515 }, { "epoch": 0.7762416794674859, "grad_norm": 1.1519606113433838, "learning_rate": 9.252192065459572e-05, "loss": 0.6136, "step": 1516 }, { "epoch": 0.7767537122375832, "grad_norm": 0.9607214331626892, "learning_rate": 9.250624462307078e-05, "loss": 0.6206, "step": 1517 }, { "epoch": 0.7772657450076805, "grad_norm": 1.1265422105789185, "learning_rate": 9.249055350908457e-05, "loss": 0.6314, "step": 1518 }, { "epoch": 0.7777777777777778, "grad_norm": 0.8872023224830627, "learning_rate": 9.247484731820474e-05, "loss": 0.6014, "step": 1519 }, { "epoch": 0.778289810547875, "grad_norm": 0.9238242506980896, "learning_rate": 9.245912605600436e-05, "loss": 0.6193, "step": 1520 }, { "epoch": 0.7788018433179723, "grad_norm": 0.9310383796691895, "learning_rate": 9.244338972806172e-05, "loss": 0.578, "step": 1521 }, { "epoch": 0.7793138760880697, "grad_norm": 0.5902627110481262, "learning_rate": 9.242763833996058e-05, "loss": 0.596, "step": 1522 }, { "epoch": 0.7798259088581669, "grad_norm": 1.0053119659423828, "learning_rate": 9.241187189728996e-05, "loss": 0.6117, "step": 1523 }, { "epoch": 0.7803379416282642, "grad_norm": 0.9190763831138611, "learning_rate": 9.23960904056443e-05, "loss": 0.6385, "step": 1524 }, { "epoch": 0.7808499743983615, "grad_norm": 1.1116129159927368, "learning_rate": 9.238029387062328e-05, "loss": 0.6375, "step": 1525 }, { "epoch": 0.7813620071684588, "grad_norm": 0.9089121222496033, "learning_rate": 9.236448229783203e-05, "loss": 0.6148, "step": 1526 }, { "epoch": 0.7818740399385561, "grad_norm": 0.8957609534263611, "learning_rate": 9.234865569288092e-05, "loss": 0.6168, "step": 1527 }, { "epoch": 0.7823860727086533, "grad_norm": 0.7927278280258179, "learning_rate": 9.233281406138572e-05, "loss": 0.6118, "step": 1528 }, { "epoch": 0.7828981054787506, "grad_norm": 0.8030017018318176, "learning_rate": 9.231695740896748e-05, "loss": 0.6067, "step": 1529 }, { "epoch": 0.783410138248848, "grad_norm": 0.7627915143966675, "learning_rate": 9.230108574125262e-05, "loss": 0.6255, "step": 1530 }, { "epoch": 0.7839221710189452, "grad_norm": 0.8994153141975403, "learning_rate": 9.228519906387288e-05, "loss": 0.631, "step": 1531 }, { "epoch": 0.7844342037890425, "grad_norm": 1.0453933477401733, "learning_rate": 9.226929738246529e-05, "loss": 0.638, "step": 1532 }, { "epoch": 0.7849462365591398, "grad_norm": 0.996675431728363, "learning_rate": 9.225338070267227e-05, "loss": 0.6184, "step": 1533 }, { "epoch": 0.7854582693292371, "grad_norm": 0.6681482195854187, "learning_rate": 9.223744903014148e-05, "loss": 0.6137, "step": 1534 }, { "epoch": 0.7859703020993344, "grad_norm": 0.7777343392372131, "learning_rate": 9.2221502370526e-05, "loss": 0.6471, "step": 1535 }, { "epoch": 0.7864823348694316, "grad_norm": 0.6889387965202332, "learning_rate": 9.220554072948411e-05, "loss": 0.5903, "step": 1536 }, { "epoch": 0.7869943676395289, "grad_norm": 0.8464424014091492, "learning_rate": 9.218956411267952e-05, "loss": 0.6332, "step": 1537 }, { "epoch": 0.7875064004096262, "grad_norm": 0.8166050910949707, "learning_rate": 9.217357252578118e-05, "loss": 0.6226, "step": 1538 }, { "epoch": 0.7880184331797235, "grad_norm": 0.6117397546768188, "learning_rate": 9.215756597446337e-05, "loss": 0.5978, "step": 1539 }, { "epoch": 0.7885304659498208, "grad_norm": 0.7217066884040833, "learning_rate": 9.21415444644057e-05, "loss": 0.612, "step": 1540 }, { "epoch": 0.789042498719918, "grad_norm": 0.9874158501625061, "learning_rate": 9.212550800129305e-05, "loss": 0.6141, "step": 1541 }, { "epoch": 0.7895545314900153, "grad_norm": 0.9933454394340515, "learning_rate": 9.210945659081565e-05, "loss": 0.6391, "step": 1542 }, { "epoch": 0.7900665642601127, "grad_norm": 0.5571685433387756, "learning_rate": 9.209339023866899e-05, "loss": 0.6006, "step": 1543 }, { "epoch": 0.7905785970302099, "grad_norm": 0.8368340134620667, "learning_rate": 9.207730895055388e-05, "loss": 0.6238, "step": 1544 }, { "epoch": 0.7910906298003072, "grad_norm": 0.9571197628974915, "learning_rate": 9.206121273217646e-05, "loss": 0.616, "step": 1545 }, { "epoch": 0.7916026625704045, "grad_norm": 0.7998978495597839, "learning_rate": 9.204510158924811e-05, "loss": 0.6006, "step": 1546 }, { "epoch": 0.7921146953405018, "grad_norm": 0.8310467600822449, "learning_rate": 9.202897552748555e-05, "loss": 0.6147, "step": 1547 }, { "epoch": 0.7926267281105991, "grad_norm": 0.9382485747337341, "learning_rate": 9.201283455261079e-05, "loss": 0.613, "step": 1548 }, { "epoch": 0.7931387608806963, "grad_norm": 0.5525487661361694, "learning_rate": 9.19966786703511e-05, "loss": 0.6338, "step": 1549 }, { "epoch": 0.7936507936507936, "grad_norm": 0.746207594871521, "learning_rate": 9.198050788643907e-05, "loss": 0.6165, "step": 1550 }, { "epoch": 0.794162826420891, "grad_norm": 0.8524749875068665, "learning_rate": 9.196432220661258e-05, "loss": 0.6209, "step": 1551 }, { "epoch": 0.7946748591909882, "grad_norm": 0.6705928444862366, "learning_rate": 9.194812163661474e-05, "loss": 0.6095, "step": 1552 }, { "epoch": 0.7951868919610855, "grad_norm": 0.6570793390274048, "learning_rate": 9.193190618219405e-05, "loss": 0.6114, "step": 1553 }, { "epoch": 0.7956989247311828, "grad_norm": 0.7881936430931091, "learning_rate": 9.191567584910418e-05, "loss": 0.6386, "step": 1554 }, { "epoch": 0.7962109575012801, "grad_norm": 0.675227165222168, "learning_rate": 9.189943064310415e-05, "loss": 0.6121, "step": 1555 }, { "epoch": 0.7967229902713774, "grad_norm": 0.5703387260437012, "learning_rate": 9.188317056995822e-05, "loss": 0.596, "step": 1556 }, { "epoch": 0.7972350230414746, "grad_norm": 0.5082479119300842, "learning_rate": 9.186689563543595e-05, "loss": 0.6151, "step": 1557 }, { "epoch": 0.7977470558115719, "grad_norm": 0.7303303480148315, "learning_rate": 9.185060584531217e-05, "loss": 0.593, "step": 1558 }, { "epoch": 0.7982590885816693, "grad_norm": 0.586883544921875, "learning_rate": 9.183430120536698e-05, "loss": 0.6193, "step": 1559 }, { "epoch": 0.7987711213517665, "grad_norm": 0.5144060850143433, "learning_rate": 9.181798172138572e-05, "loss": 0.6022, "step": 1560 }, { "epoch": 0.7992831541218638, "grad_norm": 0.7321364879608154, "learning_rate": 9.180164739915905e-05, "loss": 0.6498, "step": 1561 }, { "epoch": 0.799795186891961, "grad_norm": 0.7396813631057739, "learning_rate": 9.178529824448282e-05, "loss": 0.6167, "step": 1562 }, { "epoch": 0.8003072196620584, "grad_norm": 0.49853086471557617, "learning_rate": 9.176893426315823e-05, "loss": 0.5819, "step": 1563 }, { "epoch": 0.8008192524321557, "grad_norm": 0.6689085364341736, "learning_rate": 9.175255546099171e-05, "loss": 0.6081, "step": 1564 }, { "epoch": 0.8013312852022529, "grad_norm": 0.8793342113494873, "learning_rate": 9.17361618437949e-05, "loss": 0.6052, "step": 1565 }, { "epoch": 0.8018433179723502, "grad_norm": 0.7177857160568237, "learning_rate": 9.171975341738475e-05, "loss": 0.632, "step": 1566 }, { "epoch": 0.8023553507424476, "grad_norm": 0.48787227272987366, "learning_rate": 9.170333018758346e-05, "loss": 0.6279, "step": 1567 }, { "epoch": 0.8028673835125448, "grad_norm": 0.5946686267852783, "learning_rate": 9.168689216021847e-05, "loss": 0.6348, "step": 1568 }, { "epoch": 0.8033794162826421, "grad_norm": 0.7953499555587769, "learning_rate": 9.167043934112248e-05, "loss": 0.6236, "step": 1569 }, { "epoch": 0.8038914490527393, "grad_norm": 0.6520671248435974, "learning_rate": 9.16539717361334e-05, "loss": 0.6175, "step": 1570 }, { "epoch": 0.8044034818228366, "grad_norm": 0.6093055009841919, "learning_rate": 9.163748935109447e-05, "loss": 0.6337, "step": 1571 }, { "epoch": 0.804915514592934, "grad_norm": 0.7484267354011536, "learning_rate": 9.162099219185407e-05, "loss": 0.6264, "step": 1572 }, { "epoch": 0.8054275473630312, "grad_norm": 0.8127752542495728, "learning_rate": 9.160448026426591e-05, "loss": 0.6041, "step": 1573 }, { "epoch": 0.8059395801331285, "grad_norm": 0.7575063109397888, "learning_rate": 9.15879535741889e-05, "loss": 0.5993, "step": 1574 }, { "epoch": 0.8064516129032258, "grad_norm": 0.6322052478790283, "learning_rate": 9.157141212748722e-05, "loss": 0.6253, "step": 1575 }, { "epoch": 0.8069636456733231, "grad_norm": 0.6203233003616333, "learning_rate": 9.155485593003019e-05, "loss": 0.6348, "step": 1576 }, { "epoch": 0.8074756784434204, "grad_norm": 0.623991072177887, "learning_rate": 9.153828498769249e-05, "loss": 0.6055, "step": 1577 }, { "epoch": 0.8079877112135176, "grad_norm": 0.5187473893165588, "learning_rate": 9.152169930635397e-05, "loss": 0.6097, "step": 1578 }, { "epoch": 0.8084997439836149, "grad_norm": 0.5868754386901855, "learning_rate": 9.15050988918997e-05, "loss": 0.6026, "step": 1579 }, { "epoch": 0.8090117767537123, "grad_norm": 0.5657625794410706, "learning_rate": 9.148848375022e-05, "loss": 0.603, "step": 1580 }, { "epoch": 0.8095238095238095, "grad_norm": 0.637345016002655, "learning_rate": 9.14718538872104e-05, "loss": 0.6132, "step": 1581 }, { "epoch": 0.8100358422939068, "grad_norm": 0.5459932684898376, "learning_rate": 9.145520930877169e-05, "loss": 0.6002, "step": 1582 }, { "epoch": 0.810547875064004, "grad_norm": 0.6582471132278442, "learning_rate": 9.143855002080984e-05, "loss": 0.602, "step": 1583 }, { "epoch": 0.8110599078341014, "grad_norm": 0.6331306099891663, "learning_rate": 9.142187602923604e-05, "loss": 0.6207, "step": 1584 }, { "epoch": 0.8115719406041987, "grad_norm": 0.6224691867828369, "learning_rate": 9.140518733996672e-05, "loss": 0.6285, "step": 1585 }, { "epoch": 0.8120839733742959, "grad_norm": 0.45498016476631165, "learning_rate": 9.138848395892353e-05, "loss": 0.6139, "step": 1586 }, { "epoch": 0.8125960061443932, "grad_norm": 0.5101943612098694, "learning_rate": 9.13717658920333e-05, "loss": 0.6204, "step": 1587 }, { "epoch": 0.8131080389144906, "grad_norm": 0.6976394057273865, "learning_rate": 9.135503314522808e-05, "loss": 0.6146, "step": 1588 }, { "epoch": 0.8136200716845878, "grad_norm": 0.8608499765396118, "learning_rate": 9.133828572444519e-05, "loss": 0.6087, "step": 1589 }, { "epoch": 0.8141321044546851, "grad_norm": 0.7408691644668579, "learning_rate": 9.132152363562703e-05, "loss": 0.6012, "step": 1590 }, { "epoch": 0.8146441372247823, "grad_norm": 0.522982120513916, "learning_rate": 9.130474688472135e-05, "loss": 0.5913, "step": 1591 }, { "epoch": 0.8151561699948797, "grad_norm": 0.6068546772003174, "learning_rate": 9.128795547768099e-05, "loss": 0.6338, "step": 1592 }, { "epoch": 0.815668202764977, "grad_norm": 0.7435458302497864, "learning_rate": 9.127114942046407e-05, "loss": 0.6202, "step": 1593 }, { "epoch": 0.8161802355350742, "grad_norm": 0.701773464679718, "learning_rate": 9.125432871903384e-05, "loss": 0.6344, "step": 1594 }, { "epoch": 0.8166922683051715, "grad_norm": 0.606708824634552, "learning_rate": 9.123749337935878e-05, "loss": 0.6233, "step": 1595 }, { "epoch": 0.8172043010752689, "grad_norm": 0.5959770679473877, "learning_rate": 9.122064340741256e-05, "loss": 0.6203, "step": 1596 }, { "epoch": 0.8177163338453661, "grad_norm": 0.5003708600997925, "learning_rate": 9.120377880917407e-05, "loss": 0.597, "step": 1597 }, { "epoch": 0.8182283666154634, "grad_norm": 0.601345419883728, "learning_rate": 9.118689959062734e-05, "loss": 0.62, "step": 1598 }, { "epoch": 0.8187403993855606, "grad_norm": 0.6083617210388184, "learning_rate": 9.117000575776163e-05, "loss": 0.6157, "step": 1599 }, { "epoch": 0.819252432155658, "grad_norm": 0.5899311304092407, "learning_rate": 9.115309731657133e-05, "loss": 0.62, "step": 1600 }, { "epoch": 0.8197644649257553, "grad_norm": 0.5587561130523682, "learning_rate": 9.11361742730561e-05, "loss": 0.5808, "step": 1601 }, { "epoch": 0.8202764976958525, "grad_norm": 0.49472445249557495, "learning_rate": 9.11192366332207e-05, "loss": 0.6271, "step": 1602 }, { "epoch": 0.8207885304659498, "grad_norm": 0.6866928339004517, "learning_rate": 9.11022844030751e-05, "loss": 0.6273, "step": 1603 }, { "epoch": 0.821300563236047, "grad_norm": 0.7449778318405151, "learning_rate": 9.108531758863445e-05, "loss": 0.6112, "step": 1604 }, { "epoch": 0.8218125960061444, "grad_norm": 0.6371899843215942, "learning_rate": 9.10683361959191e-05, "loss": 0.6484, "step": 1605 }, { "epoch": 0.8223246287762417, "grad_norm": 0.5783623456954956, "learning_rate": 9.105134023095451e-05, "loss": 0.6252, "step": 1606 }, { "epoch": 0.8228366615463389, "grad_norm": 0.8172042369842529, "learning_rate": 9.103432969977139e-05, "loss": 0.6031, "step": 1607 }, { "epoch": 0.8233486943164362, "grad_norm": 0.7044656276702881, "learning_rate": 9.101730460840552e-05, "loss": 0.625, "step": 1608 }, { "epoch": 0.8238607270865336, "grad_norm": 0.4737173914909363, "learning_rate": 9.100026496289793e-05, "loss": 0.6168, "step": 1609 }, { "epoch": 0.8243727598566308, "grad_norm": 0.48440414667129517, "learning_rate": 9.098321076929479e-05, "loss": 0.6172, "step": 1610 }, { "epoch": 0.8248847926267281, "grad_norm": 0.6114824414253235, "learning_rate": 9.096614203364741e-05, "loss": 0.5877, "step": 1611 }, { "epoch": 0.8253968253968254, "grad_norm": 0.5496321320533752, "learning_rate": 9.094905876201229e-05, "loss": 0.5967, "step": 1612 }, { "epoch": 0.8259088581669227, "grad_norm": 0.4646094739437103, "learning_rate": 9.093196096045109e-05, "loss": 0.6252, "step": 1613 }, { "epoch": 0.82642089093702, "grad_norm": 0.581976592540741, "learning_rate": 9.091484863503059e-05, "loss": 0.6138, "step": 1614 }, { "epoch": 0.8269329237071172, "grad_norm": 0.7270715236663818, "learning_rate": 9.089772179182274e-05, "loss": 0.6265, "step": 1615 }, { "epoch": 0.8274449564772145, "grad_norm": 0.8103761672973633, "learning_rate": 9.088058043690466e-05, "loss": 0.6231, "step": 1616 }, { "epoch": 0.8279569892473119, "grad_norm": 0.786429762840271, "learning_rate": 9.086342457635861e-05, "loss": 0.6086, "step": 1617 }, { "epoch": 0.8284690220174091, "grad_norm": 0.5788968801498413, "learning_rate": 9.084625421627198e-05, "loss": 0.6017, "step": 1618 }, { "epoch": 0.8289810547875064, "grad_norm": 0.6350241303443909, "learning_rate": 9.082906936273732e-05, "loss": 0.6091, "step": 1619 }, { "epoch": 0.8294930875576036, "grad_norm": 0.5785453915596008, "learning_rate": 9.081187002185231e-05, "loss": 0.6252, "step": 1620 }, { "epoch": 0.830005120327701, "grad_norm": 0.48318082094192505, "learning_rate": 9.079465619971979e-05, "loss": 0.6371, "step": 1621 }, { "epoch": 0.8305171530977983, "grad_norm": 0.9915174841880798, "learning_rate": 9.077742790244774e-05, "loss": 0.6149, "step": 1622 }, { "epoch": 0.8310291858678955, "grad_norm": 1.2352343797683716, "learning_rate": 9.076018513614923e-05, "loss": 0.638, "step": 1623 }, { "epoch": 0.8315412186379928, "grad_norm": 0.657619833946228, "learning_rate": 9.074292790694255e-05, "loss": 0.6076, "step": 1624 }, { "epoch": 0.8320532514080902, "grad_norm": 1.0812597274780273, "learning_rate": 9.072565622095103e-05, "loss": 0.6057, "step": 1625 }, { "epoch": 0.8325652841781874, "grad_norm": 1.02882981300354, "learning_rate": 9.070837008430318e-05, "loss": 0.5871, "step": 1626 }, { "epoch": 0.8330773169482847, "grad_norm": 0.8375751972198486, "learning_rate": 9.069106950313264e-05, "loss": 0.6384, "step": 1627 }, { "epoch": 0.833589349718382, "grad_norm": 1.0201014280319214, "learning_rate": 9.067375448357814e-05, "loss": 0.594, "step": 1628 }, { "epoch": 0.8341013824884793, "grad_norm": 0.8548293113708496, "learning_rate": 9.06564250317836e-05, "loss": 0.6414, "step": 1629 }, { "epoch": 0.8346134152585766, "grad_norm": 1.1353791952133179, "learning_rate": 9.063908115389794e-05, "loss": 0.6147, "step": 1630 }, { "epoch": 0.8351254480286738, "grad_norm": 0.7104995250701904, "learning_rate": 9.062172285607536e-05, "loss": 0.6099, "step": 1631 }, { "epoch": 0.8356374807987711, "grad_norm": 0.7720085978507996, "learning_rate": 9.060435014447505e-05, "loss": 0.6286, "step": 1632 }, { "epoch": 0.8361495135688684, "grad_norm": 0.9330393075942993, "learning_rate": 9.058696302526135e-05, "loss": 0.611, "step": 1633 }, { "epoch": 0.8366615463389657, "grad_norm": 0.8591739535331726, "learning_rate": 9.056956150460374e-05, "loss": 0.6365, "step": 1634 }, { "epoch": 0.837173579109063, "grad_norm": 0.7474163174629211, "learning_rate": 9.05521455886768e-05, "loss": 0.6027, "step": 1635 }, { "epoch": 0.8376856118791602, "grad_norm": 0.7739270925521851, "learning_rate": 9.053471528366018e-05, "loss": 0.6139, "step": 1636 }, { "epoch": 0.8381976446492575, "grad_norm": 0.8327764272689819, "learning_rate": 9.051727059573866e-05, "loss": 0.5849, "step": 1637 }, { "epoch": 0.8387096774193549, "grad_norm": 0.6874661445617676, "learning_rate": 9.049981153110214e-05, "loss": 0.6279, "step": 1638 }, { "epoch": 0.8392217101894521, "grad_norm": 0.6032069325447083, "learning_rate": 9.048233809594561e-05, "loss": 0.5913, "step": 1639 }, { "epoch": 0.8397337429595494, "grad_norm": 0.6024311780929565, "learning_rate": 9.046485029646917e-05, "loss": 0.6205, "step": 1640 }, { "epoch": 0.8402457757296466, "grad_norm": 0.532216489315033, "learning_rate": 9.044734813887798e-05, "loss": 0.6171, "step": 1641 }, { "epoch": 0.840757808499744, "grad_norm": 0.7559048533439636, "learning_rate": 9.042983162938233e-05, "loss": 0.6125, "step": 1642 }, { "epoch": 0.8412698412698413, "grad_norm": 0.7644152641296387, "learning_rate": 9.041230077419758e-05, "loss": 0.6116, "step": 1643 }, { "epoch": 0.8417818740399385, "grad_norm": 0.6022328734397888, "learning_rate": 9.03947555795442e-05, "loss": 0.6438, "step": 1644 }, { "epoch": 0.8422939068100358, "grad_norm": 0.48420971632003784, "learning_rate": 9.037719605164774e-05, "loss": 0.6161, "step": 1645 }, { "epoch": 0.8428059395801332, "grad_norm": 0.667635440826416, "learning_rate": 9.035962219673881e-05, "loss": 0.6089, "step": 1646 }, { "epoch": 0.8433179723502304, "grad_norm": 0.8063870072364807, "learning_rate": 9.03420340210532e-05, "loss": 0.605, "step": 1647 }, { "epoch": 0.8438300051203277, "grad_norm": 0.6678447723388672, "learning_rate": 9.032443153083161e-05, "loss": 0.6017, "step": 1648 }, { "epoch": 0.844342037890425, "grad_norm": 0.4928625822067261, "learning_rate": 9.030681473232e-05, "loss": 0.6003, "step": 1649 }, { "epoch": 0.8448540706605223, "grad_norm": 0.5460777282714844, "learning_rate": 9.02891836317693e-05, "loss": 0.6023, "step": 1650 }, { "epoch": 0.8453661034306196, "grad_norm": 0.678735613822937, "learning_rate": 9.027153823543553e-05, "loss": 0.6115, "step": 1651 }, { "epoch": 0.8458781362007168, "grad_norm": 0.5419057607650757, "learning_rate": 9.02538785495798e-05, "loss": 0.6129, "step": 1652 }, { "epoch": 0.8463901689708141, "grad_norm": 0.4795182943344116, "learning_rate": 9.023620458046828e-05, "loss": 0.5977, "step": 1653 }, { "epoch": 0.8469022017409115, "grad_norm": 0.6749387383460999, "learning_rate": 9.021851633437224e-05, "loss": 0.622, "step": 1654 }, { "epoch": 0.8474142345110087, "grad_norm": 1.0361658334732056, "learning_rate": 9.020081381756795e-05, "loss": 0.615, "step": 1655 }, { "epoch": 0.847926267281106, "grad_norm": 0.9473969340324402, "learning_rate": 9.018309703633681e-05, "loss": 0.6056, "step": 1656 }, { "epoch": 0.8484383000512032, "grad_norm": 0.7729476094245911, "learning_rate": 9.016536599696524e-05, "loss": 0.6259, "step": 1657 }, { "epoch": 0.8489503328213006, "grad_norm": 0.4852728545665741, "learning_rate": 9.014762070574471e-05, "loss": 0.6078, "step": 1658 }, { "epoch": 0.8494623655913979, "grad_norm": 0.754578709602356, "learning_rate": 9.012986116897182e-05, "loss": 0.5956, "step": 1659 }, { "epoch": 0.8499743983614951, "grad_norm": 1.0429328680038452, "learning_rate": 9.011208739294814e-05, "loss": 0.6104, "step": 1660 }, { "epoch": 0.8504864311315924, "grad_norm": 0.9008357524871826, "learning_rate": 9.009429938398034e-05, "loss": 0.6188, "step": 1661 }, { "epoch": 0.8509984639016898, "grad_norm": 0.608135998249054, "learning_rate": 9.007649714838012e-05, "loss": 0.5855, "step": 1662 }, { "epoch": 0.851510496671787, "grad_norm": 0.5830646753311157, "learning_rate": 9.005868069246422e-05, "loss": 0.6216, "step": 1663 }, { "epoch": 0.8520225294418843, "grad_norm": 0.7094408869743347, "learning_rate": 9.004085002255447e-05, "loss": 0.6094, "step": 1664 }, { "epoch": 0.8525345622119815, "grad_norm": 0.7395426034927368, "learning_rate": 9.002300514497771e-05, "loss": 0.5965, "step": 1665 }, { "epoch": 0.8530465949820788, "grad_norm": 0.7800859808921814, "learning_rate": 9.000514606606581e-05, "loss": 0.632, "step": 1666 }, { "epoch": 0.8535586277521762, "grad_norm": 0.7067683339118958, "learning_rate": 8.998727279215569e-05, "loss": 0.6285, "step": 1667 }, { "epoch": 0.8540706605222734, "grad_norm": 0.9343904852867126, "learning_rate": 8.996938532958931e-05, "loss": 0.628, "step": 1668 }, { "epoch": 0.8545826932923707, "grad_norm": 0.9200063943862915, "learning_rate": 8.995148368471371e-05, "loss": 0.6111, "step": 1669 }, { "epoch": 0.855094726062468, "grad_norm": 0.5311465263366699, "learning_rate": 8.993356786388087e-05, "loss": 0.6011, "step": 1670 }, { "epoch": 0.8556067588325653, "grad_norm": 0.7989476323127747, "learning_rate": 8.991563787344788e-05, "loss": 0.6057, "step": 1671 }, { "epoch": 0.8561187916026626, "grad_norm": 1.0181012153625488, "learning_rate": 8.989769371977678e-05, "loss": 0.6078, "step": 1672 }, { "epoch": 0.8566308243727598, "grad_norm": 1.002400517463684, "learning_rate": 8.987973540923474e-05, "loss": 0.5944, "step": 1673 }, { "epoch": 0.8571428571428571, "grad_norm": 0.7457436919212341, "learning_rate": 8.986176294819387e-05, "loss": 0.6215, "step": 1674 }, { "epoch": 0.8576548899129545, "grad_norm": 0.5610541105270386, "learning_rate": 8.984377634303131e-05, "loss": 0.6024, "step": 1675 }, { "epoch": 0.8581669226830517, "grad_norm": 1.0072689056396484, "learning_rate": 8.982577560012925e-05, "loss": 0.6226, "step": 1676 }, { "epoch": 0.858678955453149, "grad_norm": 1.0611203908920288, "learning_rate": 8.980776072587487e-05, "loss": 0.5908, "step": 1677 }, { "epoch": 0.8591909882232462, "grad_norm": 0.7125012874603271, "learning_rate": 8.978973172666039e-05, "loss": 0.6392, "step": 1678 }, { "epoch": 0.8597030209933436, "grad_norm": 0.8300865888595581, "learning_rate": 8.977168860888303e-05, "loss": 0.613, "step": 1679 }, { "epoch": 0.8602150537634409, "grad_norm": 0.9766619205474854, "learning_rate": 8.9753631378945e-05, "loss": 0.5956, "step": 1680 }, { "epoch": 0.8607270865335381, "grad_norm": 1.3505761623382568, "learning_rate": 8.973556004325355e-05, "loss": 0.618, "step": 1681 }, { "epoch": 0.8612391193036354, "grad_norm": 0.5300628542900085, "learning_rate": 8.971747460822091e-05, "loss": 0.5988, "step": 1682 }, { "epoch": 0.8617511520737328, "grad_norm": 1.1950321197509766, "learning_rate": 8.969937508026432e-05, "loss": 0.6081, "step": 1683 }, { "epoch": 0.86226318484383, "grad_norm": 1.1572154760360718, "learning_rate": 8.968126146580602e-05, "loss": 0.5973, "step": 1684 }, { "epoch": 0.8627752176139273, "grad_norm": 0.7786370515823364, "learning_rate": 8.966313377127327e-05, "loss": 0.6316, "step": 1685 }, { "epoch": 0.8632872503840245, "grad_norm": 1.0651003122329712, "learning_rate": 8.964499200309829e-05, "loss": 0.6278, "step": 1686 }, { "epoch": 0.8637992831541219, "grad_norm": 0.8564833402633667, "learning_rate": 8.962683616771833e-05, "loss": 0.608, "step": 1687 }, { "epoch": 0.8643113159242192, "grad_norm": 0.6578037142753601, "learning_rate": 8.960866627157559e-05, "loss": 0.6118, "step": 1688 }, { "epoch": 0.8648233486943164, "grad_norm": 0.665177583694458, "learning_rate": 8.959048232111729e-05, "loss": 0.6196, "step": 1689 }, { "epoch": 0.8653353814644137, "grad_norm": 0.9826841354370117, "learning_rate": 8.957228432279563e-05, "loss": 0.6158, "step": 1690 }, { "epoch": 0.8658474142345111, "grad_norm": 0.750813364982605, "learning_rate": 8.95540722830678e-05, "loss": 0.6237, "step": 1691 }, { "epoch": 0.8663594470046083, "grad_norm": 0.5553444623947144, "learning_rate": 8.953584620839595e-05, "loss": 0.6175, "step": 1692 }, { "epoch": 0.8668714797747056, "grad_norm": 0.8312572836875916, "learning_rate": 8.951760610524724e-05, "loss": 0.625, "step": 1693 }, { "epoch": 0.8673835125448028, "grad_norm": 1.0072152614593506, "learning_rate": 8.949935198009378e-05, "loss": 0.5974, "step": 1694 }, { "epoch": 0.8678955453149002, "grad_norm": 0.7129234671592712, "learning_rate": 8.948108383941269e-05, "loss": 0.6161, "step": 1695 }, { "epoch": 0.8684075780849975, "grad_norm": 0.7264617681503296, "learning_rate": 8.946280168968601e-05, "loss": 0.6035, "step": 1696 }, { "epoch": 0.8689196108550947, "grad_norm": 0.7299166917800903, "learning_rate": 8.94445055374008e-05, "loss": 0.5927, "step": 1697 }, { "epoch": 0.869431643625192, "grad_norm": 0.5357263684272766, "learning_rate": 8.942619538904908e-05, "loss": 0.6358, "step": 1698 }, { "epoch": 0.8699436763952892, "grad_norm": 0.7424433827400208, "learning_rate": 8.940787125112782e-05, "loss": 0.6019, "step": 1699 }, { "epoch": 0.8704557091653866, "grad_norm": 0.6726065278053284, "learning_rate": 8.938953313013895e-05, "loss": 0.5769, "step": 1700 }, { "epoch": 0.8709677419354839, "grad_norm": 0.48438695073127747, "learning_rate": 8.937118103258937e-05, "loss": 0.6149, "step": 1701 }, { "epoch": 0.8714797747055811, "grad_norm": 0.5236538648605347, "learning_rate": 8.935281496499097e-05, "loss": 0.579, "step": 1702 }, { "epoch": 0.8719918074756784, "grad_norm": 0.6117557287216187, "learning_rate": 8.933443493386056e-05, "loss": 0.6228, "step": 1703 }, { "epoch": 0.8725038402457758, "grad_norm": 0.531024694442749, "learning_rate": 8.93160409457199e-05, "loss": 0.6058, "step": 1704 }, { "epoch": 0.873015873015873, "grad_norm": 0.7144777178764343, "learning_rate": 8.929763300709571e-05, "loss": 0.6301, "step": 1705 }, { "epoch": 0.8735279057859703, "grad_norm": 0.6891056299209595, "learning_rate": 8.927921112451969e-05, "loss": 0.6261, "step": 1706 }, { "epoch": 0.8740399385560675, "grad_norm": 0.44690799713134766, "learning_rate": 8.926077530452846e-05, "loss": 0.6243, "step": 1707 }, { "epoch": 0.8745519713261649, "grad_norm": 1.0460760593414307, "learning_rate": 8.924232555366356e-05, "loss": 0.6131, "step": 1708 }, { "epoch": 0.8750640040962622, "grad_norm": 1.1975854635238647, "learning_rate": 8.922386187847154e-05, "loss": 0.5939, "step": 1709 }, { "epoch": 0.8755760368663594, "grad_norm": 0.5999739170074463, "learning_rate": 8.920538428550383e-05, "loss": 0.601, "step": 1710 }, { "epoch": 0.8760880696364567, "grad_norm": 1.1115529537200928, "learning_rate": 8.918689278131683e-05, "loss": 0.6063, "step": 1711 }, { "epoch": 0.8766001024065541, "grad_norm": 0.9265246987342834, "learning_rate": 8.916838737247188e-05, "loss": 0.6226, "step": 1712 }, { "epoch": 0.8771121351766513, "grad_norm": 0.8183275461196899, "learning_rate": 8.914986806553521e-05, "loss": 0.597, "step": 1713 }, { "epoch": 0.8776241679467486, "grad_norm": 1.0923467874526978, "learning_rate": 8.913133486707803e-05, "loss": 0.6279, "step": 1714 }, { "epoch": 0.8781362007168458, "grad_norm": 0.7898536324501038, "learning_rate": 8.911278778367646e-05, "loss": 0.5932, "step": 1715 }, { "epoch": 0.8786482334869432, "grad_norm": 0.6585994958877563, "learning_rate": 8.909422682191158e-05, "loss": 0.614, "step": 1716 }, { "epoch": 0.8791602662570405, "grad_norm": 0.67762690782547, "learning_rate": 8.90756519883693e-05, "loss": 0.6173, "step": 1717 }, { "epoch": 0.8796722990271377, "grad_norm": 0.5326299667358398, "learning_rate": 8.905706328964056e-05, "loss": 0.6093, "step": 1718 }, { "epoch": 0.880184331797235, "grad_norm": 0.7882176637649536, "learning_rate": 8.903846073232116e-05, "loss": 0.6257, "step": 1719 }, { "epoch": 0.8806963645673324, "grad_norm": 0.8547189235687256, "learning_rate": 8.901984432301185e-05, "loss": 0.6382, "step": 1720 }, { "epoch": 0.8812083973374296, "grad_norm": 0.6048323512077332, "learning_rate": 8.900121406831826e-05, "loss": 0.6117, "step": 1721 }, { "epoch": 0.8817204301075269, "grad_norm": 0.6573890447616577, "learning_rate": 8.898256997485095e-05, "loss": 0.6188, "step": 1722 }, { "epoch": 0.8822324628776241, "grad_norm": 0.7889838814735413, "learning_rate": 8.896391204922539e-05, "loss": 0.613, "step": 1723 }, { "epoch": 0.8827444956477215, "grad_norm": 0.5455670952796936, "learning_rate": 8.894524029806198e-05, "loss": 0.6206, "step": 1724 }, { "epoch": 0.8832565284178188, "grad_norm": 1.1310827732086182, "learning_rate": 8.892655472798598e-05, "loss": 0.6099, "step": 1725 }, { "epoch": 0.883768561187916, "grad_norm": 0.8875821232795715, "learning_rate": 8.890785534562757e-05, "loss": 0.6233, "step": 1726 }, { "epoch": 0.8842805939580133, "grad_norm": 0.6872793436050415, "learning_rate": 8.888914215762189e-05, "loss": 0.6335, "step": 1727 }, { "epoch": 0.8847926267281107, "grad_norm": 0.6162108778953552, "learning_rate": 8.887041517060888e-05, "loss": 0.6013, "step": 1728 }, { "epoch": 0.8853046594982079, "grad_norm": 0.8425163626670837, "learning_rate": 8.885167439123343e-05, "loss": 0.628, "step": 1729 }, { "epoch": 0.8858166922683052, "grad_norm": 0.9179390072822571, "learning_rate": 8.883291982614532e-05, "loss": 0.5844, "step": 1730 }, { "epoch": 0.8863287250384024, "grad_norm": 0.6089448928833008, "learning_rate": 8.881415148199923e-05, "loss": 0.636, "step": 1731 }, { "epoch": 0.8868407578084997, "grad_norm": 0.47150376439094543, "learning_rate": 8.879536936545472e-05, "loss": 0.6212, "step": 1732 }, { "epoch": 0.8873527905785971, "grad_norm": 0.6981831192970276, "learning_rate": 8.877657348317622e-05, "loss": 0.613, "step": 1733 }, { "epoch": 0.8878648233486943, "grad_norm": 0.57545405626297, "learning_rate": 8.875776384183308e-05, "loss": 0.6461, "step": 1734 }, { "epoch": 0.8883768561187916, "grad_norm": 0.5061913728713989, "learning_rate": 8.873894044809947e-05, "loss": 0.6017, "step": 1735 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5891237258911133, "learning_rate": 8.872010330865455e-05, "loss": 0.609, "step": 1736 }, { "epoch": 0.8894009216589862, "grad_norm": 0.8038243055343628, "learning_rate": 8.870125243018222e-05, "loss": 0.6152, "step": 1737 }, { "epoch": 0.8899129544290835, "grad_norm": 0.818476140499115, "learning_rate": 8.868238781937137e-05, "loss": 0.6109, "step": 1738 }, { "epoch": 0.8904249871991807, "grad_norm": 0.6069773435592651, "learning_rate": 8.866350948291568e-05, "loss": 0.6053, "step": 1739 }, { "epoch": 0.890937019969278, "grad_norm": 0.6026650071144104, "learning_rate": 8.864461742751379e-05, "loss": 0.6106, "step": 1740 }, { "epoch": 0.8914490527393754, "grad_norm": 0.7308768033981323, "learning_rate": 8.862571165986911e-05, "loss": 0.6382, "step": 1741 }, { "epoch": 0.8919610855094726, "grad_norm": 0.9851502180099487, "learning_rate": 8.860679218668997e-05, "loss": 0.6162, "step": 1742 }, { "epoch": 0.8924731182795699, "grad_norm": 0.9711754322052002, "learning_rate": 8.858785901468955e-05, "loss": 0.6113, "step": 1743 }, { "epoch": 0.8929851510496671, "grad_norm": 0.7437930107116699, "learning_rate": 8.856891215058591e-05, "loss": 0.6123, "step": 1744 }, { "epoch": 0.8934971838197645, "grad_norm": 0.7862691879272461, "learning_rate": 8.854995160110195e-05, "loss": 0.6146, "step": 1745 }, { "epoch": 0.8940092165898618, "grad_norm": 0.8337175846099854, "learning_rate": 8.853097737296543e-05, "loss": 0.5962, "step": 1746 }, { "epoch": 0.894521249359959, "grad_norm": 1.127328634262085, "learning_rate": 8.851198947290894e-05, "loss": 0.6163, "step": 1747 }, { "epoch": 0.8950332821300563, "grad_norm": 0.817255437374115, "learning_rate": 8.849298790766995e-05, "loss": 0.6187, "step": 1748 }, { "epoch": 0.8955453149001537, "grad_norm": 0.6251739859580994, "learning_rate": 8.84739726839908e-05, "loss": 0.6235, "step": 1749 }, { "epoch": 0.8960573476702509, "grad_norm": 1.1205369234085083, "learning_rate": 8.845494380861862e-05, "loss": 0.6516, "step": 1750 }, { "epoch": 0.8965693804403482, "grad_norm": 0.995013952255249, "learning_rate": 8.843590128830544e-05, "loss": 0.6078, "step": 1751 }, { "epoch": 0.8970814132104454, "grad_norm": 0.7903568744659424, "learning_rate": 8.841684512980806e-05, "loss": 0.6118, "step": 1752 }, { "epoch": 0.8975934459805428, "grad_norm": 0.4267909526824951, "learning_rate": 8.83977753398882e-05, "loss": 0.6153, "step": 1753 }, { "epoch": 0.8981054787506401, "grad_norm": 0.7952775955200195, "learning_rate": 8.837869192531236e-05, "loss": 0.6133, "step": 1754 }, { "epoch": 0.8986175115207373, "grad_norm": 0.9760710000991821, "learning_rate": 8.835959489285192e-05, "loss": 0.5977, "step": 1755 }, { "epoch": 0.8991295442908346, "grad_norm": 0.9331663846969604, "learning_rate": 8.834048424928305e-05, "loss": 0.6049, "step": 1756 }, { "epoch": 0.899641577060932, "grad_norm": 0.7779060006141663, "learning_rate": 8.832136000138676e-05, "loss": 0.6207, "step": 1757 }, { "epoch": 0.9001536098310292, "grad_norm": 0.582532525062561, "learning_rate": 8.83022221559489e-05, "loss": 0.6102, "step": 1758 }, { "epoch": 0.9006656426011265, "grad_norm": 0.7223466634750366, "learning_rate": 8.828307071976015e-05, "loss": 0.6116, "step": 1759 }, { "epoch": 0.9011776753712237, "grad_norm": 0.8213663697242737, "learning_rate": 8.826390569961598e-05, "loss": 0.5842, "step": 1760 }, { "epoch": 0.901689708141321, "grad_norm": 0.7164159417152405, "learning_rate": 8.824472710231673e-05, "loss": 0.6205, "step": 1761 }, { "epoch": 0.9022017409114184, "grad_norm": 0.4892750084400177, "learning_rate": 8.82255349346675e-05, "loss": 0.591, "step": 1762 }, { "epoch": 0.9027137736815156, "grad_norm": 0.7776530981063843, "learning_rate": 8.820632920347823e-05, "loss": 0.5993, "step": 1763 }, { "epoch": 0.9032258064516129, "grad_norm": 1.175022006034851, "learning_rate": 8.81871099155637e-05, "loss": 0.6127, "step": 1764 }, { "epoch": 0.9037378392217101, "grad_norm": 0.5878638029098511, "learning_rate": 8.816787707774347e-05, "loss": 0.6111, "step": 1765 }, { "epoch": 0.9042498719918075, "grad_norm": 0.6462081074714661, "learning_rate": 8.81486306968419e-05, "loss": 0.6178, "step": 1766 }, { "epoch": 0.9047619047619048, "grad_norm": 0.9615750908851624, "learning_rate": 8.81293707796882e-05, "loss": 0.6025, "step": 1767 }, { "epoch": 0.905273937532002, "grad_norm": 0.8272503018379211, "learning_rate": 8.811009733311632e-05, "loss": 0.6159, "step": 1768 }, { "epoch": 0.9057859703020993, "grad_norm": 0.6217873096466064, "learning_rate": 8.809081036396505e-05, "loss": 0.5727, "step": 1769 }, { "epoch": 0.9062980030721967, "grad_norm": 0.7474400997161865, "learning_rate": 8.8071509879078e-05, "loss": 0.6258, "step": 1770 }, { "epoch": 0.9068100358422939, "grad_norm": 0.5759752988815308, "learning_rate": 8.80521958853035e-05, "loss": 0.6178, "step": 1771 }, { "epoch": 0.9073220686123912, "grad_norm": 0.5036731362342834, "learning_rate": 8.803286838949477e-05, "loss": 0.5964, "step": 1772 }, { "epoch": 0.9078341013824884, "grad_norm": 0.6920586824417114, "learning_rate": 8.801352739850974e-05, "loss": 0.6021, "step": 1773 }, { "epoch": 0.9083461341525858, "grad_norm": 0.6652421951293945, "learning_rate": 8.799417291921117e-05, "loss": 0.6324, "step": 1774 }, { "epoch": 0.9088581669226831, "grad_norm": 0.561606228351593, "learning_rate": 8.797480495846659e-05, "loss": 0.6182, "step": 1775 }, { "epoch": 0.9093701996927803, "grad_norm": 0.514660656452179, "learning_rate": 8.795542352314835e-05, "loss": 0.6131, "step": 1776 }, { "epoch": 0.9098822324628776, "grad_norm": 0.5046374201774597, "learning_rate": 8.79360286201335e-05, "loss": 0.6145, "step": 1777 }, { "epoch": 0.910394265232975, "grad_norm": 0.4954206049442291, "learning_rate": 8.791662025630398e-05, "loss": 0.6065, "step": 1778 }, { "epoch": 0.9109062980030722, "grad_norm": 0.5050227046012878, "learning_rate": 8.789719843854639e-05, "loss": 0.6156, "step": 1779 }, { "epoch": 0.9114183307731695, "grad_norm": 0.4614417254924774, "learning_rate": 8.78777631737522e-05, "loss": 0.6183, "step": 1780 }, { "epoch": 0.9119303635432667, "grad_norm": 0.47474405169487, "learning_rate": 8.785831446881759e-05, "loss": 0.614, "step": 1781 }, { "epoch": 0.9124423963133641, "grad_norm": 0.5478948354721069, "learning_rate": 8.783885233064353e-05, "loss": 0.6274, "step": 1782 }, { "epoch": 0.9129544290834614, "grad_norm": 0.694898247718811, "learning_rate": 8.781937676613577e-05, "loss": 0.5928, "step": 1783 }, { "epoch": 0.9134664618535586, "grad_norm": 0.9866032600402832, "learning_rate": 8.779988778220478e-05, "loss": 0.5865, "step": 1784 }, { "epoch": 0.9139784946236559, "grad_norm": 0.48460525274276733, "learning_rate": 8.778038538576588e-05, "loss": 0.6033, "step": 1785 }, { "epoch": 0.9144905273937532, "grad_norm": 0.4385916292667389, "learning_rate": 8.776086958373904e-05, "loss": 0.6047, "step": 1786 }, { "epoch": 0.9150025601638505, "grad_norm": 0.6898157596588135, "learning_rate": 8.774134038304907e-05, "loss": 0.6092, "step": 1787 }, { "epoch": 0.9155145929339478, "grad_norm": 0.8064818382263184, "learning_rate": 8.772179779062548e-05, "loss": 0.6239, "step": 1788 }, { "epoch": 0.916026625704045, "grad_norm": 0.7898547649383545, "learning_rate": 8.770224181340258e-05, "loss": 0.6013, "step": 1789 }, { "epoch": 0.9165386584741424, "grad_norm": 0.6296464204788208, "learning_rate": 8.768267245831937e-05, "loss": 0.6114, "step": 1790 }, { "epoch": 0.9170506912442397, "grad_norm": 0.42064476013183594, "learning_rate": 8.766308973231964e-05, "loss": 0.6173, "step": 1791 }, { "epoch": 0.9175627240143369, "grad_norm": 0.48058849573135376, "learning_rate": 8.764349364235196e-05, "loss": 0.6255, "step": 1792 }, { "epoch": 0.9180747567844342, "grad_norm": 0.5768174529075623, "learning_rate": 8.762388419536953e-05, "loss": 0.6185, "step": 1793 }, { "epoch": 0.9185867895545314, "grad_norm": 0.6031467318534851, "learning_rate": 8.76042613983304e-05, "loss": 0.6031, "step": 1794 }, { "epoch": 0.9190988223246288, "grad_norm": 0.5830101370811462, "learning_rate": 8.75846252581973e-05, "loss": 0.5928, "step": 1795 }, { "epoch": 0.9196108550947261, "grad_norm": 0.5765612721443176, "learning_rate": 8.756497578193772e-05, "loss": 0.5958, "step": 1796 }, { "epoch": 0.9201228878648233, "grad_norm": 0.6832147240638733, "learning_rate": 8.754531297652384e-05, "loss": 0.6073, "step": 1797 }, { "epoch": 0.9206349206349206, "grad_norm": 0.8681760430335999, "learning_rate": 8.752563684893262e-05, "loss": 0.6324, "step": 1798 }, { "epoch": 0.921146953405018, "grad_norm": 0.9135375618934631, "learning_rate": 8.750594740614573e-05, "loss": 0.6456, "step": 1799 }, { "epoch": 0.9216589861751152, "grad_norm": 0.6388181447982788, "learning_rate": 8.748624465514957e-05, "loss": 0.5714, "step": 1800 }, { "epoch": 0.9221710189452125, "grad_norm": 0.9249054789543152, "learning_rate": 8.746652860293523e-05, "loss": 0.6101, "step": 1801 }, { "epoch": 0.9226830517153097, "grad_norm": 0.9336694478988647, "learning_rate": 8.744679925649855e-05, "loss": 0.6164, "step": 1802 }, { "epoch": 0.9231950844854071, "grad_norm": 0.9151612520217896, "learning_rate": 8.742705662284009e-05, "loss": 0.6035, "step": 1803 }, { "epoch": 0.9237071172555044, "grad_norm": 1.0828979015350342, "learning_rate": 8.74073007089651e-05, "loss": 0.6165, "step": 1804 }, { "epoch": 0.9242191500256016, "grad_norm": 0.9234681129455566, "learning_rate": 8.738753152188358e-05, "loss": 0.6131, "step": 1805 }, { "epoch": 0.9247311827956989, "grad_norm": 0.7952959537506104, "learning_rate": 8.736774906861018e-05, "loss": 0.6341, "step": 1806 }, { "epoch": 0.9252432155657963, "grad_norm": 0.5798991322517395, "learning_rate": 8.734795335616432e-05, "loss": 0.6018, "step": 1807 }, { "epoch": 0.9257552483358935, "grad_norm": 0.8766960501670837, "learning_rate": 8.732814439157011e-05, "loss": 0.6415, "step": 1808 }, { "epoch": 0.9262672811059908, "grad_norm": 0.8110509514808655, "learning_rate": 8.730832218185633e-05, "loss": 0.5967, "step": 1809 }, { "epoch": 0.926779313876088, "grad_norm": 0.6414927840232849, "learning_rate": 8.72884867340565e-05, "loss": 0.6366, "step": 1810 }, { "epoch": 0.9272913466461854, "grad_norm": 0.7525176405906677, "learning_rate": 8.72686380552088e-05, "loss": 0.616, "step": 1811 }, { "epoch": 0.9278033794162827, "grad_norm": 0.8867536783218384, "learning_rate": 8.724877615235612e-05, "loss": 0.5995, "step": 1812 }, { "epoch": 0.9283154121863799, "grad_norm": 0.9028398394584656, "learning_rate": 8.722890103254605e-05, "loss": 0.6053, "step": 1813 }, { "epoch": 0.9288274449564772, "grad_norm": 0.6045601963996887, "learning_rate": 8.720901270283088e-05, "loss": 0.5961, "step": 1814 }, { "epoch": 0.9293394777265745, "grad_norm": 0.6283228993415833, "learning_rate": 8.718911117026755e-05, "loss": 0.6062, "step": 1815 }, { "epoch": 0.9298515104966718, "grad_norm": 0.6859341263771057, "learning_rate": 8.716919644191774e-05, "loss": 0.6002, "step": 1816 }, { "epoch": 0.9303635432667691, "grad_norm": 0.6847391128540039, "learning_rate": 8.714926852484773e-05, "loss": 0.5933, "step": 1817 }, { "epoch": 0.9308755760368663, "grad_norm": 0.7276634573936462, "learning_rate": 8.712932742612858e-05, "loss": 0.6023, "step": 1818 }, { "epoch": 0.9313876088069637, "grad_norm": 0.8076296448707581, "learning_rate": 8.710937315283594e-05, "loss": 0.6032, "step": 1819 }, { "epoch": 0.931899641577061, "grad_norm": 0.7426518201828003, "learning_rate": 8.708940571205018e-05, "loss": 0.6175, "step": 1820 }, { "epoch": 0.9324116743471582, "grad_norm": 0.5047186613082886, "learning_rate": 8.706942511085635e-05, "loss": 0.6176, "step": 1821 }, { "epoch": 0.9329237071172555, "grad_norm": 0.8018460869789124, "learning_rate": 8.704943135634413e-05, "loss": 0.6253, "step": 1822 }, { "epoch": 0.9334357398873528, "grad_norm": 1.0259642601013184, "learning_rate": 8.70294244556079e-05, "loss": 0.6241, "step": 1823 }, { "epoch": 0.9339477726574501, "grad_norm": 0.9280132055282593, "learning_rate": 8.700940441574669e-05, "loss": 0.6323, "step": 1824 }, { "epoch": 0.9344598054275474, "grad_norm": 0.7550615668296814, "learning_rate": 8.698937124386421e-05, "loss": 0.6061, "step": 1825 }, { "epoch": 0.9349718381976446, "grad_norm": 0.47823408246040344, "learning_rate": 8.696932494706882e-05, "loss": 0.6257, "step": 1826 }, { "epoch": 0.9354838709677419, "grad_norm": 0.7385604381561279, "learning_rate": 8.694926553247352e-05, "loss": 0.6294, "step": 1827 }, { "epoch": 0.9359959037378393, "grad_norm": 0.801995575428009, "learning_rate": 8.692919300719595e-05, "loss": 0.6008, "step": 1828 }, { "epoch": 0.9365079365079365, "grad_norm": 0.5411646366119385, "learning_rate": 8.690910737835849e-05, "loss": 0.6074, "step": 1829 }, { "epoch": 0.9370199692780338, "grad_norm": 0.7678266763687134, "learning_rate": 8.688900865308807e-05, "loss": 0.5757, "step": 1830 }, { "epoch": 0.937532002048131, "grad_norm": 1.0773032903671265, "learning_rate": 8.68688968385163e-05, "loss": 0.6059, "step": 1831 }, { "epoch": 0.9380440348182284, "grad_norm": 0.9248655438423157, "learning_rate": 8.684877194177947e-05, "loss": 0.6263, "step": 1832 }, { "epoch": 0.9385560675883257, "grad_norm": 0.6628385186195374, "learning_rate": 8.682863397001847e-05, "loss": 0.6111, "step": 1833 }, { "epoch": 0.9390681003584229, "grad_norm": 0.7859993577003479, "learning_rate": 8.680848293037885e-05, "loss": 0.621, "step": 1834 }, { "epoch": 0.9395801331285202, "grad_norm": 0.9431124329566956, "learning_rate": 8.678831883001077e-05, "loss": 0.6081, "step": 1835 }, { "epoch": 0.9400921658986175, "grad_norm": 0.6432191729545593, "learning_rate": 8.676814167606906e-05, "loss": 0.5983, "step": 1836 }, { "epoch": 0.9406041986687148, "grad_norm": 0.6938913464546204, "learning_rate": 8.674795147571317e-05, "loss": 0.5848, "step": 1837 }, { "epoch": 0.9411162314388121, "grad_norm": 0.5149176120758057, "learning_rate": 8.672774823610716e-05, "loss": 0.6094, "step": 1838 }, { "epoch": 0.9416282642089093, "grad_norm": 0.6561318635940552, "learning_rate": 8.670753196441973e-05, "loss": 0.6035, "step": 1839 }, { "epoch": 0.9421402969790067, "grad_norm": 0.5405431985855103, "learning_rate": 8.668730266782423e-05, "loss": 0.6128, "step": 1840 }, { "epoch": 0.942652329749104, "grad_norm": 0.6093837022781372, "learning_rate": 8.666706035349857e-05, "loss": 0.5936, "step": 1841 }, { "epoch": 0.9431643625192012, "grad_norm": 0.5754795074462891, "learning_rate": 8.664680502862536e-05, "loss": 0.6242, "step": 1842 }, { "epoch": 0.9436763952892985, "grad_norm": 0.6382490992546082, "learning_rate": 8.662653670039175e-05, "loss": 0.6271, "step": 1843 }, { "epoch": 0.9441884280593958, "grad_norm": 0.6652403473854065, "learning_rate": 8.660625537598954e-05, "loss": 0.595, "step": 1844 }, { "epoch": 0.9447004608294931, "grad_norm": 0.5887729525566101, "learning_rate": 8.658596106261515e-05, "loss": 0.637, "step": 1845 }, { "epoch": 0.9452124935995904, "grad_norm": 0.6836559772491455, "learning_rate": 8.656565376746959e-05, "loss": 0.6255, "step": 1846 }, { "epoch": 0.9457245263696876, "grad_norm": 0.5295282006263733, "learning_rate": 8.654533349775848e-05, "loss": 0.5979, "step": 1847 }, { "epoch": 0.946236559139785, "grad_norm": 0.6576597094535828, "learning_rate": 8.652500026069208e-05, "loss": 0.5863, "step": 1848 }, { "epoch": 0.9467485919098823, "grad_norm": 0.5399554967880249, "learning_rate": 8.650465406348517e-05, "loss": 0.5771, "step": 1849 }, { "epoch": 0.9472606246799795, "grad_norm": 0.5018578171730042, "learning_rate": 8.648429491335721e-05, "loss": 0.5953, "step": 1850 }, { "epoch": 0.9477726574500768, "grad_norm": 0.8756446242332458, "learning_rate": 8.646392281753219e-05, "loss": 0.5845, "step": 1851 }, { "epoch": 0.9482846902201741, "grad_norm": 0.8809206485748291, "learning_rate": 8.644353778323877e-05, "loss": 0.5816, "step": 1852 }, { "epoch": 0.9487967229902714, "grad_norm": 0.6588075160980225, "learning_rate": 8.642313981771013e-05, "loss": 0.6192, "step": 1853 }, { "epoch": 0.9493087557603687, "grad_norm": 0.5527541637420654, "learning_rate": 8.640272892818407e-05, "loss": 0.5983, "step": 1854 }, { "epoch": 0.9498207885304659, "grad_norm": 0.5538603663444519, "learning_rate": 8.638230512190298e-05, "loss": 0.598, "step": 1855 }, { "epoch": 0.9503328213005633, "grad_norm": 0.6122649312019348, "learning_rate": 8.63618684061138e-05, "loss": 0.618, "step": 1856 }, { "epoch": 0.9508448540706606, "grad_norm": 0.6477901935577393, "learning_rate": 8.63414187880681e-05, "loss": 0.5978, "step": 1857 }, { "epoch": 0.9513568868407578, "grad_norm": 0.7201472520828247, "learning_rate": 8.6320956275022e-05, "loss": 0.5973, "step": 1858 }, { "epoch": 0.9518689196108551, "grad_norm": 0.7655221223831177, "learning_rate": 8.63004808742362e-05, "loss": 0.6349, "step": 1859 }, { "epoch": 0.9523809523809523, "grad_norm": 0.812676727771759, "learning_rate": 8.627999259297597e-05, "loss": 0.6148, "step": 1860 }, { "epoch": 0.9528929851510497, "grad_norm": 0.7688848972320557, "learning_rate": 8.625949143851116e-05, "loss": 0.609, "step": 1861 }, { "epoch": 0.953405017921147, "grad_norm": 0.5826058983802795, "learning_rate": 8.623897741811615e-05, "loss": 0.6242, "step": 1862 }, { "epoch": 0.9539170506912442, "grad_norm": 0.5536845922470093, "learning_rate": 8.621845053906997e-05, "loss": 0.5868, "step": 1863 }, { "epoch": 0.9544290834613415, "grad_norm": 0.7110509276390076, "learning_rate": 8.619791080865609e-05, "loss": 0.6132, "step": 1864 }, { "epoch": 0.9549411162314388, "grad_norm": 0.5267532467842102, "learning_rate": 8.617735823416267e-05, "loss": 0.6155, "step": 1865 }, { "epoch": 0.9554531490015361, "grad_norm": 0.531934916973114, "learning_rate": 8.615679282288234e-05, "loss": 0.6045, "step": 1866 }, { "epoch": 0.9559651817716334, "grad_norm": 1.0107884407043457, "learning_rate": 8.61362145821123e-05, "loss": 0.6135, "step": 1867 }, { "epoch": 0.9564772145417306, "grad_norm": 0.6162605285644531, "learning_rate": 8.611562351915432e-05, "loss": 0.6043, "step": 1868 }, { "epoch": 0.956989247311828, "grad_norm": 0.6790778636932373, "learning_rate": 8.609501964131472e-05, "loss": 0.597, "step": 1869 }, { "epoch": 0.9575012800819253, "grad_norm": 0.6799835562705994, "learning_rate": 8.607440295590436e-05, "loss": 0.6175, "step": 1870 }, { "epoch": 0.9580133128520225, "grad_norm": 0.7091780304908752, "learning_rate": 8.605377347023865e-05, "loss": 0.6069, "step": 1871 }, { "epoch": 0.9585253456221198, "grad_norm": 0.7080487608909607, "learning_rate": 8.603313119163749e-05, "loss": 0.5986, "step": 1872 }, { "epoch": 0.9590373783922171, "grad_norm": 0.495822012424469, "learning_rate": 8.601247612742544e-05, "loss": 0.5927, "step": 1873 }, { "epoch": 0.9595494111623144, "grad_norm": 0.5208033323287964, "learning_rate": 8.599180828493147e-05, "loss": 0.6097, "step": 1874 }, { "epoch": 0.9600614439324117, "grad_norm": 0.4554497003555298, "learning_rate": 8.597112767148917e-05, "loss": 0.6084, "step": 1875 }, { "epoch": 0.9605734767025089, "grad_norm": 0.5212755799293518, "learning_rate": 8.595043429443658e-05, "loss": 0.591, "step": 1876 }, { "epoch": 0.9610855094726063, "grad_norm": 0.5193934440612793, "learning_rate": 8.592972816111635e-05, "loss": 0.603, "step": 1877 }, { "epoch": 0.9615975422427036, "grad_norm": 0.6795120239257812, "learning_rate": 8.590900927887562e-05, "loss": 0.6092, "step": 1878 }, { "epoch": 0.9621095750128008, "grad_norm": 0.7339894771575928, "learning_rate": 8.588827765506605e-05, "loss": 0.6053, "step": 1879 }, { "epoch": 0.9626216077828981, "grad_norm": 0.6276906728744507, "learning_rate": 8.586753329704385e-05, "loss": 0.6032, "step": 1880 }, { "epoch": 0.9631336405529954, "grad_norm": 0.5775157809257507, "learning_rate": 8.584677621216967e-05, "loss": 0.5787, "step": 1881 }, { "epoch": 0.9636456733230927, "grad_norm": 0.6808149218559265, "learning_rate": 8.58260064078088e-05, "loss": 0.6096, "step": 1882 }, { "epoch": 0.96415770609319, "grad_norm": 0.6674193739891052, "learning_rate": 8.580522389133094e-05, "loss": 0.6078, "step": 1883 }, { "epoch": 0.9646697388632872, "grad_norm": 0.4134068489074707, "learning_rate": 8.578442867011034e-05, "loss": 0.5881, "step": 1884 }, { "epoch": 0.9651817716333846, "grad_norm": 0.5615413188934326, "learning_rate": 8.576362075152576e-05, "loss": 0.6384, "step": 1885 }, { "epoch": 0.9656938044034818, "grad_norm": 0.629902720451355, "learning_rate": 8.574280014296043e-05, "loss": 0.6365, "step": 1886 }, { "epoch": 0.9662058371735791, "grad_norm": 0.5860879421234131, "learning_rate": 8.572196685180217e-05, "loss": 0.6022, "step": 1887 }, { "epoch": 0.9667178699436764, "grad_norm": 0.5416891574859619, "learning_rate": 8.570112088544319e-05, "loss": 0.63, "step": 1888 }, { "epoch": 0.9672299027137736, "grad_norm": 0.5377897024154663, "learning_rate": 8.568026225128027e-05, "loss": 0.6375, "step": 1889 }, { "epoch": 0.967741935483871, "grad_norm": 0.5749699473381042, "learning_rate": 8.565939095671466e-05, "loss": 0.5986, "step": 1890 }, { "epoch": 0.9682539682539683, "grad_norm": 0.6093477606773376, "learning_rate": 8.563850700915212e-05, "loss": 0.6073, "step": 1891 }, { "epoch": 0.9687660010240655, "grad_norm": 0.6219916939735413, "learning_rate": 8.561761041600286e-05, "loss": 0.6003, "step": 1892 }, { "epoch": 0.9692780337941628, "grad_norm": 0.5532930493354797, "learning_rate": 8.559670118468163e-05, "loss": 0.6073, "step": 1893 }, { "epoch": 0.9697900665642601, "grad_norm": 0.6049473881721497, "learning_rate": 8.557577932260763e-05, "loss": 0.5924, "step": 1894 }, { "epoch": 0.9703020993343574, "grad_norm": 0.7536075711250305, "learning_rate": 8.555484483720455e-05, "loss": 0.6149, "step": 1895 }, { "epoch": 0.9708141321044547, "grad_norm": 0.6211358308792114, "learning_rate": 8.553389773590055e-05, "loss": 0.6173, "step": 1896 }, { "epoch": 0.9713261648745519, "grad_norm": 0.561103880405426, "learning_rate": 8.551293802612828e-05, "loss": 0.5839, "step": 1897 }, { "epoch": 0.9718381976446493, "grad_norm": 0.5376758575439453, "learning_rate": 8.549196571532486e-05, "loss": 0.6067, "step": 1898 }, { "epoch": 0.9723502304147466, "grad_norm": 0.636143684387207, "learning_rate": 8.54709808109319e-05, "loss": 0.6168, "step": 1899 }, { "epoch": 0.9728622631848438, "grad_norm": 0.5275603532791138, "learning_rate": 8.544998332039542e-05, "loss": 0.6122, "step": 1900 }, { "epoch": 0.9733742959549411, "grad_norm": 0.7172293066978455, "learning_rate": 8.542897325116596e-05, "loss": 0.6233, "step": 1901 }, { "epoch": 0.9738863287250384, "grad_norm": 0.8159469366073608, "learning_rate": 8.540795061069855e-05, "loss": 0.6264, "step": 1902 }, { "epoch": 0.9743983614951357, "grad_norm": 0.6992416381835938, "learning_rate": 8.538691540645258e-05, "loss": 0.5923, "step": 1903 }, { "epoch": 0.974910394265233, "grad_norm": 0.6905255317687988, "learning_rate": 8.536586764589199e-05, "loss": 0.602, "step": 1904 }, { "epoch": 0.9754224270353302, "grad_norm": 0.7247764468193054, "learning_rate": 8.534480733648511e-05, "loss": 0.6144, "step": 1905 }, { "epoch": 0.9759344598054276, "grad_norm": 0.6376498937606812, "learning_rate": 8.532373448570478e-05, "loss": 0.589, "step": 1906 }, { "epoch": 0.9764464925755248, "grad_norm": 0.5382902026176453, "learning_rate": 8.530264910102829e-05, "loss": 0.6252, "step": 1907 }, { "epoch": 0.9769585253456221, "grad_norm": 0.7567707896232605, "learning_rate": 8.52815511899373e-05, "loss": 0.5893, "step": 1908 }, { "epoch": 0.9774705581157194, "grad_norm": 0.950793981552124, "learning_rate": 8.526044075991802e-05, "loss": 0.5928, "step": 1909 }, { "epoch": 0.9779825908858167, "grad_norm": 0.7355795502662659, "learning_rate": 8.523931781846099e-05, "loss": 0.594, "step": 1910 }, { "epoch": 0.978494623655914, "grad_norm": 0.5082160234451294, "learning_rate": 8.52181823730613e-05, "loss": 0.6182, "step": 1911 }, { "epoch": 0.9790066564260113, "grad_norm": 0.6597242951393127, "learning_rate": 8.519703443121839e-05, "loss": 0.6113, "step": 1912 }, { "epoch": 0.9795186891961085, "grad_norm": 0.5583549737930298, "learning_rate": 8.517587400043619e-05, "loss": 0.6177, "step": 1913 }, { "epoch": 0.9800307219662059, "grad_norm": 0.4815637171268463, "learning_rate": 8.515470108822303e-05, "loss": 0.6233, "step": 1914 }, { "epoch": 0.9805427547363031, "grad_norm": 0.48437929153442383, "learning_rate": 8.513351570209171e-05, "loss": 0.6157, "step": 1915 }, { "epoch": 0.9810547875064004, "grad_norm": 0.4349682033061981, "learning_rate": 8.511231784955937e-05, "loss": 0.5968, "step": 1916 }, { "epoch": 0.9815668202764977, "grad_norm": 0.5066204071044922, "learning_rate": 8.509110753814769e-05, "loss": 0.6013, "step": 1917 }, { "epoch": 0.982078853046595, "grad_norm": 0.4891235828399658, "learning_rate": 8.506988477538267e-05, "loss": 0.6088, "step": 1918 }, { "epoch": 0.9825908858166923, "grad_norm": 0.6094465255737305, "learning_rate": 8.504864956879477e-05, "loss": 0.6154, "step": 1919 }, { "epoch": 0.9831029185867896, "grad_norm": 0.6717943549156189, "learning_rate": 8.502740192591887e-05, "loss": 0.5962, "step": 1920 }, { "epoch": 0.9836149513568868, "grad_norm": 0.7286078929901123, "learning_rate": 8.500614185429428e-05, "loss": 0.6247, "step": 1921 }, { "epoch": 0.9841269841269841, "grad_norm": 1.0690590143203735, "learning_rate": 8.498486936146465e-05, "loss": 0.6126, "step": 1922 }, { "epoch": 0.9846390168970814, "grad_norm": 0.9175248146057129, "learning_rate": 8.496358445497811e-05, "loss": 0.5948, "step": 1923 }, { "epoch": 0.9851510496671787, "grad_norm": 0.7829539179801941, "learning_rate": 8.494228714238719e-05, "loss": 0.6133, "step": 1924 }, { "epoch": 0.985663082437276, "grad_norm": 0.7345153093338013, "learning_rate": 8.492097743124877e-05, "loss": 0.6012, "step": 1925 }, { "epoch": 0.9861751152073732, "grad_norm": 0.9032869935035706, "learning_rate": 8.489965532912417e-05, "loss": 0.6131, "step": 1926 }, { "epoch": 0.9866871479774706, "grad_norm": 1.0448259115219116, "learning_rate": 8.487832084357909e-05, "loss": 0.6114, "step": 1927 }, { "epoch": 0.9871991807475679, "grad_norm": 0.796886682510376, "learning_rate": 8.485697398218363e-05, "loss": 0.613, "step": 1928 }, { "epoch": 0.9877112135176651, "grad_norm": 0.7067273855209351, "learning_rate": 8.483561475251229e-05, "loss": 0.5902, "step": 1929 }, { "epoch": 0.9882232462877624, "grad_norm": 0.5703265070915222, "learning_rate": 8.481424316214398e-05, "loss": 0.5973, "step": 1930 }, { "epoch": 0.9887352790578597, "grad_norm": 0.7664267420768738, "learning_rate": 8.479285921866189e-05, "loss": 0.6067, "step": 1931 }, { "epoch": 0.989247311827957, "grad_norm": 0.928532600402832, "learning_rate": 8.477146292965372e-05, "loss": 0.5984, "step": 1932 }, { "epoch": 0.9897593445980543, "grad_norm": 0.9129771590232849, "learning_rate": 8.475005430271152e-05, "loss": 0.639, "step": 1933 }, { "epoch": 0.9902713773681515, "grad_norm": 0.8429197669029236, "learning_rate": 8.472863334543166e-05, "loss": 0.5906, "step": 1934 }, { "epoch": 0.9907834101382489, "grad_norm": 0.6563157439231873, "learning_rate": 8.470720006541493e-05, "loss": 0.6195, "step": 1935 }, { "epoch": 0.9912954429083461, "grad_norm": 0.5825872421264648, "learning_rate": 8.468575447026651e-05, "loss": 0.6135, "step": 1936 }, { "epoch": 0.9918074756784434, "grad_norm": 0.5533396601676941, "learning_rate": 8.466429656759591e-05, "loss": 0.6121, "step": 1937 }, { "epoch": 0.9923195084485407, "grad_norm": 0.6714436411857605, "learning_rate": 8.4642826365017e-05, "loss": 0.6087, "step": 1938 }, { "epoch": 0.992831541218638, "grad_norm": 0.5732312202453613, "learning_rate": 8.462134387014806e-05, "loss": 0.603, "step": 1939 }, { "epoch": 0.9933435739887353, "grad_norm": 0.5527217984199524, "learning_rate": 8.459984909061173e-05, "loss": 0.598, "step": 1940 }, { "epoch": 0.9938556067588326, "grad_norm": 0.9165377616882324, "learning_rate": 8.457834203403494e-05, "loss": 0.5881, "step": 1941 }, { "epoch": 0.9943676395289298, "grad_norm": 0.9610447883605957, "learning_rate": 8.455682270804907e-05, "loss": 0.6149, "step": 1942 }, { "epoch": 0.9948796722990272, "grad_norm": 0.7240959405899048, "learning_rate": 8.453529112028977e-05, "loss": 0.5996, "step": 1943 }, { "epoch": 0.9953917050691244, "grad_norm": 0.5790973901748657, "learning_rate": 8.45137472783971e-05, "loss": 0.6143, "step": 1944 }, { "epoch": 0.9959037378392217, "grad_norm": 0.571142315864563, "learning_rate": 8.449219119001543e-05, "loss": 0.6113, "step": 1945 }, { "epoch": 0.996415770609319, "grad_norm": 0.7975537776947021, "learning_rate": 8.447062286279352e-05, "loss": 0.6057, "step": 1946 }, { "epoch": 0.9969278033794163, "grad_norm": 0.7440320253372192, "learning_rate": 8.444904230438442e-05, "loss": 0.6068, "step": 1947 }, { "epoch": 0.9974398361495136, "grad_norm": 0.4440317749977112, "learning_rate": 8.442744952244556e-05, "loss": 0.5799, "step": 1948 }, { "epoch": 0.9979518689196109, "grad_norm": 0.6356591582298279, "learning_rate": 8.440584452463869e-05, "loss": 0.6035, "step": 1949 }, { "epoch": 0.9984639016897081, "grad_norm": 0.9069144129753113, "learning_rate": 8.438422731862987e-05, "loss": 0.5986, "step": 1950 }, { "epoch": 0.9989759344598055, "grad_norm": 0.8557628393173218, "learning_rate": 8.436259791208954e-05, "loss": 0.5993, "step": 1951 }, { "epoch": 0.9994879672299027, "grad_norm": 0.6528682112693787, "learning_rate": 8.434095631269246e-05, "loss": 0.6086, "step": 1952 }, { "epoch": 1.0, "grad_norm": 0.522171139717102, "learning_rate": 8.431930252811766e-05, "loss": 0.5979, "step": 1953 }, { "epoch": 1.0, "eval_loss": 0.4175611734390259, "eval_runtime": 294.385, "eval_samples_per_second": 215.174, "eval_steps_per_second": 1.923, "step": 1953 } ], "logging_steps": 1, "max_steps": 5859, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3244271933054675e+20, "train_batch_size": 14, "trial_name": null, "trial_params": null }