{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1231567006968077, "eval_steps": 200, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021606438718738182, "grad_norm": 0.6293283104896545, "learning_rate": 8.000000000000001e-06, "loss": 2.762, "num_input_tokens_seen": 43456, "step": 5, "train_runtime": 22.3128, "train_tokens_per_second": 1947.58 }, { "epoch": 0.0043212877437476364, "grad_norm": 0.709394633769989, "learning_rate": 1.8e-05, "loss": 2.7429, "num_input_tokens_seen": 86704, "step": 10, "train_runtime": 44.8748, "train_tokens_per_second": 1932.131 }, { "epoch": 0.0064819316156214555, "grad_norm": 0.6257256269454956, "learning_rate": 2.8000000000000003e-05, "loss": 2.7046, "num_input_tokens_seen": 130448, "step": 15, "train_runtime": 66.2724, "train_tokens_per_second": 1968.361 }, { "epoch": 0.008642575487495273, "grad_norm": 0.819546103477478, "learning_rate": 3.8e-05, "loss": 2.6496, "num_input_tokens_seen": 173536, "step": 20, "train_runtime": 87.627, "train_tokens_per_second": 1980.394 }, { "epoch": 0.010803219359369093, "grad_norm": 1.0976862907409668, "learning_rate": 4.8e-05, "loss": 2.485, "num_input_tokens_seen": 217520, "step": 25, "train_runtime": 109.5133, "train_tokens_per_second": 1986.243 }, { "epoch": 0.012963863231242911, "grad_norm": 1.3042502403259277, "learning_rate": 5.8e-05, "loss": 2.2924, "num_input_tokens_seen": 260592, "step": 30, "train_runtime": 130.6452, "train_tokens_per_second": 1994.654 }, { "epoch": 0.01512450710311673, "grad_norm": 1.3307358026504517, "learning_rate": 6.800000000000001e-05, "loss": 1.9451, "num_input_tokens_seen": 304000, "step": 35, "train_runtime": 152.3406, "train_tokens_per_second": 1995.529 }, { "epoch": 0.017285150974990546, "grad_norm": 5.1953444480896, "learning_rate": 7.800000000000001e-05, "loss": 1.5215, "num_input_tokens_seen": 347632, "step": 40, "train_runtime": 175.2126, "train_tokens_per_second": 1984.059 }, { "epoch": 0.019445794846864366, "grad_norm": 2.9334633350372314, "learning_rate": 8.800000000000001e-05, "loss": 0.9511, "num_input_tokens_seen": 391584, "step": 45, "train_runtime": 196.5345, "train_tokens_per_second": 1992.444 }, { "epoch": 0.021606438718738186, "grad_norm": 1.869616150856018, "learning_rate": 9.8e-05, "loss": 0.4297, "num_input_tokens_seen": 434896, "step": 50, "train_runtime": 218.7064, "train_tokens_per_second": 1988.493 }, { "epoch": 0.023767082590612002, "grad_norm": 0.39343124628067017, "learning_rate": 0.00010800000000000001, "loss": 0.2177, "num_input_tokens_seen": 478208, "step": 55, "train_runtime": 240.285, "train_tokens_per_second": 1990.17 }, { "epoch": 0.025927726462485822, "grad_norm": 0.3148583173751831, "learning_rate": 0.000118, "loss": 0.1802, "num_input_tokens_seen": 521056, "step": 60, "train_runtime": 261.4811, "train_tokens_per_second": 1992.71 }, { "epoch": 0.02808837033435964, "grad_norm": 0.1977643072605133, "learning_rate": 0.00012800000000000002, "loss": 0.1738, "num_input_tokens_seen": 564816, "step": 65, "train_runtime": 282.7387, "train_tokens_per_second": 1997.661 }, { "epoch": 0.03024901420623346, "grad_norm": 0.27150753140449524, "learning_rate": 0.000138, "loss": 0.1355, "num_input_tokens_seen": 607936, "step": 70, "train_runtime": 304.3102, "train_tokens_per_second": 1997.751 }, { "epoch": 0.03240965807810728, "grad_norm": 0.14433489739894867, "learning_rate": 0.000148, "loss": 0.131, "num_input_tokens_seen": 651184, "step": 75, "train_runtime": 325.5002, "train_tokens_per_second": 2000.564 }, { "epoch": 0.03457030194998109, "grad_norm": 0.14637072384357452, "learning_rate": 0.00015800000000000002, "loss": 0.135, "num_input_tokens_seen": 694992, "step": 80, "train_runtime": 346.7707, "train_tokens_per_second": 2004.183 }, { "epoch": 0.03673094582185491, "grad_norm": 0.12602286040782928, "learning_rate": 0.000168, "loss": 0.1138, "num_input_tokens_seen": 738016, "step": 85, "train_runtime": 367.8144, "train_tokens_per_second": 2006.49 }, { "epoch": 0.03889158969372873, "grad_norm": 0.11580361425876617, "learning_rate": 0.00017800000000000002, "loss": 0.1121, "num_input_tokens_seen": 781584, "step": 90, "train_runtime": 389.4092, "train_tokens_per_second": 2007.102 }, { "epoch": 0.04105223356560255, "grad_norm": 0.13435131311416626, "learning_rate": 0.000188, "loss": 0.1168, "num_input_tokens_seen": 825184, "step": 95, "train_runtime": 411.2782, "train_tokens_per_second": 2006.389 }, { "epoch": 0.04321287743747637, "grad_norm": 0.11393298953771591, "learning_rate": 0.00019800000000000002, "loss": 0.103, "num_input_tokens_seen": 868384, "step": 100, "train_runtime": 433.0861, "train_tokens_per_second": 2005.107 }, { "epoch": 0.045373521309350184, "grad_norm": 0.11593123525381088, "learning_rate": 0.00019999961523722363, "loss": 0.1113, "num_input_tokens_seen": 912000, "step": 105, "train_runtime": 456.2462, "train_tokens_per_second": 1998.921 }, { "epoch": 0.047534165181224004, "grad_norm": 0.13952848315238953, "learning_rate": 0.00019999805214351914, "loss": 0.105, "num_input_tokens_seen": 955216, "step": 110, "train_runtime": 477.4357, "train_tokens_per_second": 2000.722 }, { "epoch": 0.049694809053097824, "grad_norm": 0.09782172739505768, "learning_rate": 0.0001999952866899929, "loss": 0.0944, "num_input_tokens_seen": 998384, "step": 115, "train_runtime": 498.2711, "train_tokens_per_second": 2003.696 }, { "epoch": 0.051855452924971644, "grad_norm": 0.11062366515398026, "learning_rate": 0.00019999131890989627, "loss": 0.1028, "num_input_tokens_seen": 1041632, "step": 120, "train_runtime": 520.2058, "train_tokens_per_second": 2002.346 }, { "epoch": 0.05401609679684546, "grad_norm": 0.0976124033331871, "learning_rate": 0.00019998614885093717, "loss": 0.0926, "num_input_tokens_seen": 1084576, "step": 125, "train_runtime": 541.3878, "train_tokens_per_second": 2003.326 }, { "epoch": 0.05617674066871928, "grad_norm": 0.1115734875202179, "learning_rate": 0.00019997977657527956, "loss": 0.0913, "num_input_tokens_seen": 1127856, "step": 130, "train_runtime": 562.8485, "train_tokens_per_second": 2003.836 }, { "epoch": 0.0583373845405931, "grad_norm": 0.10568796098232269, "learning_rate": 0.00019997220215954258, "loss": 0.1054, "num_input_tokens_seen": 1171424, "step": 135, "train_runtime": 584.7953, "train_tokens_per_second": 2003.135 }, { "epoch": 0.06049802841246692, "grad_norm": 0.10306553542613983, "learning_rate": 0.00019996342569479972, "loss": 0.099, "num_input_tokens_seen": 1214848, "step": 140, "train_runtime": 607.0714, "train_tokens_per_second": 2001.162 }, { "epoch": 0.06265867228434073, "grad_norm": 0.09895918518304825, "learning_rate": 0.00019995344728657773, "loss": 0.0887, "num_input_tokens_seen": 1258080, "step": 145, "train_runtime": 628.7533, "train_tokens_per_second": 2000.912 }, { "epoch": 0.06481931615621456, "grad_norm": 0.08400742709636688, "learning_rate": 0.00019994226705485538, "loss": 0.1008, "num_input_tokens_seen": 1301680, "step": 150, "train_runtime": 649.8435, "train_tokens_per_second": 2003.067 }, { "epoch": 0.06697996002808837, "grad_norm": 0.09079141914844513, "learning_rate": 0.0001999298851340618, "loss": 0.1107, "num_input_tokens_seen": 1345520, "step": 155, "train_runtime": 671.9504, "train_tokens_per_second": 2002.41 }, { "epoch": 0.06914060389996218, "grad_norm": 0.11345323175191879, "learning_rate": 0.00019991630167307523, "loss": 0.0963, "num_input_tokens_seen": 1388816, "step": 160, "train_runtime": 693.7679, "train_tokens_per_second": 2001.845 }, { "epoch": 0.07130124777183601, "grad_norm": 0.13441255688667297, "learning_rate": 0.00019990151683522086, "loss": 0.0884, "num_input_tokens_seen": 1431936, "step": 165, "train_runtime": 714.9933, "train_tokens_per_second": 2002.727 }, { "epoch": 0.07346189164370982, "grad_norm": 0.1069858968257904, "learning_rate": 0.00019988553079826912, "loss": 0.0893, "num_input_tokens_seen": 1475008, "step": 170, "train_runtime": 737.0241, "train_tokens_per_second": 2001.302 }, { "epoch": 0.07562253551558365, "grad_norm": 0.08582285791635513, "learning_rate": 0.00019986834375443344, "loss": 0.0948, "num_input_tokens_seen": 1518496, "step": 175, "train_runtime": 758.2236, "train_tokens_per_second": 2002.702 }, { "epoch": 0.07778317938745746, "grad_norm": 0.1251683533191681, "learning_rate": 0.00019984995591036797, "loss": 0.0878, "num_input_tokens_seen": 1561744, "step": 180, "train_runtime": 779.9879, "train_tokens_per_second": 2002.267 }, { "epoch": 0.07994382325933128, "grad_norm": 0.11450210958719254, "learning_rate": 0.00019983036748716504, "loss": 0.1033, "num_input_tokens_seen": 1605408, "step": 185, "train_runtime": 801.5796, "train_tokens_per_second": 2002.805 }, { "epoch": 0.0821044671312051, "grad_norm": 0.09959390759468079, "learning_rate": 0.00019980957872035258, "loss": 0.0977, "num_input_tokens_seen": 1649280, "step": 190, "train_runtime": 822.8119, "train_tokens_per_second": 2004.444 }, { "epoch": 0.08426511100307892, "grad_norm": 0.09225820004940033, "learning_rate": 0.00019978758985989128, "loss": 0.0896, "num_input_tokens_seen": 1692512, "step": 195, "train_runtime": 844.7454, "train_tokens_per_second": 2003.577 }, { "epoch": 0.08642575487495274, "grad_norm": 0.13596701622009277, "learning_rate": 0.00019976440117017144, "loss": 0.0956, "num_input_tokens_seen": 1735840, "step": 200, "train_runtime": 866.713, "train_tokens_per_second": 2002.785 }, { "epoch": 0.08642575487495274, "eval_loss": 0.09148535877466202, "eval_runtime": 710.7457, "eval_samples_per_second": 13.024, "eval_steps_per_second": 6.513, "num_input_tokens_seen": 1735840, "step": 200 }, { "epoch": 0.08858639874682656, "grad_norm": 0.11967090517282486, "learning_rate": 0.00019974001293001002, "loss": 0.1006, "num_input_tokens_seen": 1779424, "step": 205, "train_runtime": 1603.3338, "train_tokens_per_second": 1109.828 }, { "epoch": 0.09074704261870037, "grad_norm": 0.10459703952074051, "learning_rate": 0.00019971442543264712, "loss": 0.1082, "num_input_tokens_seen": 1823168, "step": 210, "train_runtime": 1624.6497, "train_tokens_per_second": 1122.191 }, { "epoch": 0.0929076864905742, "grad_norm": 0.0971212238073349, "learning_rate": 0.00019968763898574253, "loss": 0.0975, "num_input_tokens_seen": 1866528, "step": 215, "train_runtime": 1645.7576, "train_tokens_per_second": 1134.145 }, { "epoch": 0.09506833036244801, "grad_norm": 0.07356134802103043, "learning_rate": 0.000199659653911372, "loss": 0.0821, "num_input_tokens_seen": 1909808, "step": 220, "train_runtime": 1667.2345, "train_tokens_per_second": 1145.495 }, { "epoch": 0.09722897423432182, "grad_norm": 0.10070156306028366, "learning_rate": 0.00019963047054602338, "loss": 0.087, "num_input_tokens_seen": 1953008, "step": 225, "train_runtime": 1689.0691, "train_tokens_per_second": 1156.263 }, { "epoch": 0.09938961810619565, "grad_norm": 0.09706509113311768, "learning_rate": 0.00019960008924059254, "loss": 0.0993, "num_input_tokens_seen": 1996752, "step": 230, "train_runtime": 1710.7637, "train_tokens_per_second": 1167.17 }, { "epoch": 0.10155026197806946, "grad_norm": 0.09175528585910797, "learning_rate": 0.0001995685103603792, "loss": 0.0815, "num_input_tokens_seen": 2039968, "step": 235, "train_runtime": 1732.6168, "train_tokens_per_second": 1177.391 }, { "epoch": 0.10371090584994329, "grad_norm": 0.1141452044248581, "learning_rate": 0.0001995357342850826, "loss": 0.0992, "num_input_tokens_seen": 2083696, "step": 240, "train_runtime": 1755.0335, "train_tokens_per_second": 1187.269 }, { "epoch": 0.1058715497218171, "grad_norm": 0.11164279282093048, "learning_rate": 0.00019950176140879668, "loss": 0.0877, "num_input_tokens_seen": 2127072, "step": 245, "train_runtime": 1777.3851, "train_tokens_per_second": 1196.742 }, { "epoch": 0.10803219359369091, "grad_norm": 0.09218638390302658, "learning_rate": 0.00019946659214000568, "loss": 0.0856, "num_input_tokens_seen": 2170448, "step": 250, "train_runtime": 1799.395, "train_tokens_per_second": 1206.21 }, { "epoch": 0.11019283746556474, "grad_norm": 0.09893018752336502, "learning_rate": 0.00019943022690157894, "loss": 0.0839, "num_input_tokens_seen": 2213648, "step": 255, "train_runtime": 1821.4888, "train_tokens_per_second": 1215.296 }, { "epoch": 0.11235348133743855, "grad_norm": 0.10116513073444366, "learning_rate": 0.00019939266613076603, "loss": 0.091, "num_input_tokens_seen": 2257296, "step": 260, "train_runtime": 1843.5073, "train_tokens_per_second": 1224.457 }, { "epoch": 0.11451412520931238, "grad_norm": 0.08388976752758026, "learning_rate": 0.00019935391027919134, "loss": 0.0855, "num_input_tokens_seen": 2300624, "step": 265, "train_runtime": 1864.8115, "train_tokens_per_second": 1233.703 }, { "epoch": 0.1166747690811862, "grad_norm": 0.08835554867982864, "learning_rate": 0.00019931395981284878, "loss": 0.0885, "num_input_tokens_seen": 2344096, "step": 270, "train_runtime": 1887.3294, "train_tokens_per_second": 1242.017 }, { "epoch": 0.11883541295306001, "grad_norm": 0.09479964524507523, "learning_rate": 0.000199272815212096, "loss": 0.0909, "num_input_tokens_seen": 2387696, "step": 275, "train_runtime": 1909.7108, "train_tokens_per_second": 1250.292 }, { "epoch": 0.12099605682493383, "grad_norm": 0.09753034263849258, "learning_rate": 0.00019923047697164884, "loss": 0.0937, "num_input_tokens_seen": 2431264, "step": 280, "train_runtime": 1931.1784, "train_tokens_per_second": 1258.954 }, { "epoch": 0.12315670069680765, "grad_norm": 0.09564550220966339, "learning_rate": 0.00019918694560057518, "loss": 0.0859, "num_input_tokens_seen": 2474928, "step": 285, "train_runtime": 1952.8165, "train_tokens_per_second": 1267.363 }, { "epoch": 0.12531734456868146, "grad_norm": 0.08517869561910629, "learning_rate": 0.0001991422216222889, "loss": 0.0814, "num_input_tokens_seen": 2517936, "step": 290, "train_runtime": 1973.8825, "train_tokens_per_second": 1275.626 }, { "epoch": 0.12747798844055527, "grad_norm": 0.09123244881629944, "learning_rate": 0.0001990963055745437, "loss": 0.0872, "num_input_tokens_seen": 2561312, "step": 295, "train_runtime": 1995.4942, "train_tokens_per_second": 1283.548 }, { "epoch": 0.12963863231242911, "grad_norm": 0.08103613555431366, "learning_rate": 0.0001990491980094264, "loss": 0.0811, "num_input_tokens_seen": 2604464, "step": 300, "train_runtime": 2016.775, "train_tokens_per_second": 1291.4 }, { "epoch": 0.13179927618430293, "grad_norm": 0.10851939767599106, "learning_rate": 0.00019900089949335042, "loss": 0.0964, "num_input_tokens_seen": 2648432, "step": 305, "train_runtime": 2039.7188, "train_tokens_per_second": 1298.43 }, { "epoch": 0.13395992005617674, "grad_norm": 0.08056960254907608, "learning_rate": 0.00019895141060704912, "loss": 0.0715, "num_input_tokens_seen": 2691472, "step": 310, "train_runtime": 2060.7107, "train_tokens_per_second": 1306.089 }, { "epoch": 0.13612056392805055, "grad_norm": 0.11214818060398102, "learning_rate": 0.0001989007319455685, "loss": 0.0839, "num_input_tokens_seen": 2734672, "step": 315, "train_runtime": 2082.0763, "train_tokens_per_second": 1313.435 }, { "epoch": 0.13828120779992437, "grad_norm": 0.11480865627527237, "learning_rate": 0.00019884886411826035, "loss": 0.0838, "num_input_tokens_seen": 2777792, "step": 320, "train_runtime": 2103.4799, "train_tokens_per_second": 1320.57 }, { "epoch": 0.1404418516717982, "grad_norm": 0.08724990487098694, "learning_rate": 0.0001987958077487747, "loss": 0.0846, "num_input_tokens_seen": 2821232, "step": 325, "train_runtime": 2125.391, "train_tokens_per_second": 1327.394 }, { "epoch": 0.14260249554367202, "grad_norm": 0.09756699949502945, "learning_rate": 0.00019874156347505242, "loss": 0.0902, "num_input_tokens_seen": 2864736, "step": 330, "train_runtime": 2147.0956, "train_tokens_per_second": 1334.238 }, { "epoch": 0.14476313941554583, "grad_norm": 0.08448482304811478, "learning_rate": 0.0001986861319493176, "loss": 0.0826, "num_input_tokens_seen": 2908048, "step": 335, "train_runtime": 2168.7932, "train_tokens_per_second": 1340.86 }, { "epoch": 0.14692378328741965, "grad_norm": 0.10293188691139221, "learning_rate": 0.0001986295138380696, "loss": 0.0816, "num_input_tokens_seen": 2951424, "step": 340, "train_runtime": 2189.9397, "train_tokens_per_second": 1347.719 }, { "epoch": 0.14908442715929346, "grad_norm": 0.09431219100952148, "learning_rate": 0.00019857170982207518, "loss": 0.097, "num_input_tokens_seen": 2995280, "step": 345, "train_runtime": 2211.8567, "train_tokens_per_second": 1354.193 }, { "epoch": 0.1512450710311673, "grad_norm": 0.08135256171226501, "learning_rate": 0.00019851272059636003, "loss": 0.0777, "num_input_tokens_seen": 3038272, "step": 350, "train_runtime": 2233.3418, "train_tokens_per_second": 1360.415 }, { "epoch": 0.1534057149030411, "grad_norm": 0.08785713464021683, "learning_rate": 0.00019845254687020077, "loss": 0.0881, "num_input_tokens_seen": 3081776, "step": 355, "train_runtime": 2254.994, "train_tokens_per_second": 1366.645 }, { "epoch": 0.15556635877491493, "grad_norm": 0.07563728839159012, "learning_rate": 0.0001983911893671162, "loss": 0.0737, "num_input_tokens_seen": 3124848, "step": 360, "train_runtime": 2277.7146, "train_tokens_per_second": 1371.923 }, { "epoch": 0.15772700264678874, "grad_norm": 0.10026325285434723, "learning_rate": 0.00019832864882485862, "loss": 0.0756, "num_input_tokens_seen": 3167904, "step": 365, "train_runtime": 2299.0524, "train_tokens_per_second": 1377.917 }, { "epoch": 0.15988764651866255, "grad_norm": 0.09677627682685852, "learning_rate": 0.00019826492599540508, "loss": 0.0805, "num_input_tokens_seen": 3210928, "step": 370, "train_runtime": 2320.9453, "train_tokens_per_second": 1383.457 }, { "epoch": 0.1620482903905364, "grad_norm": 0.1106276884675026, "learning_rate": 0.00019820002164494817, "loss": 0.0956, "num_input_tokens_seen": 3254384, "step": 375, "train_runtime": 2342.6293, "train_tokens_per_second": 1389.201 }, { "epoch": 0.1642089342624102, "grad_norm": 0.09862922132015228, "learning_rate": 0.00019813393655388696, "loss": 0.0758, "num_input_tokens_seen": 3297280, "step": 380, "train_runtime": 2364.2207, "train_tokens_per_second": 1394.658 }, { "epoch": 0.16636957813428402, "grad_norm": 0.08001670241355896, "learning_rate": 0.00019806667151681744, "loss": 0.087, "num_input_tokens_seen": 3340832, "step": 385, "train_runtime": 2386.4765, "train_tokens_per_second": 1399.901 }, { "epoch": 0.16853022200615783, "grad_norm": 0.10081043839454651, "learning_rate": 0.0001979982273425232, "loss": 0.0727, "num_input_tokens_seen": 3383856, "step": 390, "train_runtime": 2408.253, "train_tokens_per_second": 1405.108 }, { "epoch": 0.17069086587803164, "grad_norm": 0.10049381852149963, "learning_rate": 0.00019792860485396554, "loss": 0.0931, "num_input_tokens_seen": 3427360, "step": 395, "train_runtime": 2430.5513, "train_tokens_per_second": 1410.116 }, { "epoch": 0.17285150974990549, "grad_norm": 0.10096573084592819, "learning_rate": 0.00019785780488827356, "loss": 0.0926, "num_input_tokens_seen": 3470800, "step": 400, "train_runtime": 2451.9847, "train_tokens_per_second": 1415.506 }, { "epoch": 0.17285150974990549, "eval_loss": 0.0820649117231369, "eval_runtime": 711.7049, "eval_samples_per_second": 13.007, "eval_steps_per_second": 6.504, "num_input_tokens_seen": 3470800, "step": 400 }, { "epoch": 0.1750121536217793, "grad_norm": 0.06486905366182327, "learning_rate": 0.00019778582829673414, "loss": 0.0722, "num_input_tokens_seen": 3514048, "step": 405, "train_runtime": 3190.7217, "train_tokens_per_second": 1101.333 }, { "epoch": 0.1771727974936531, "grad_norm": 0.07102972269058228, "learning_rate": 0.00019771267594478184, "loss": 0.0814, "num_input_tokens_seen": 3557232, "step": 410, "train_runtime": 3212.7197, "train_tokens_per_second": 1107.234 }, { "epoch": 0.17933344136552692, "grad_norm": 0.0909392461180687, "learning_rate": 0.00019763834871198825, "loss": 0.0833, "num_input_tokens_seen": 3600528, "step": 415, "train_runtime": 3233.939, "train_tokens_per_second": 1113.357 }, { "epoch": 0.18149408523740074, "grad_norm": 0.10018228739500046, "learning_rate": 0.00019756284749205153, "loss": 0.0959, "num_input_tokens_seen": 3644144, "step": 420, "train_runtime": 3255.3066, "train_tokens_per_second": 1119.447 }, { "epoch": 0.18365472910927455, "grad_norm": 0.08901096135377884, "learning_rate": 0.00019748617319278577, "loss": 0.089, "num_input_tokens_seen": 3687856, "step": 425, "train_runtime": 3277.573, "train_tokens_per_second": 1125.179 }, { "epoch": 0.1858153729811484, "grad_norm": 0.07547247409820557, "learning_rate": 0.0001974083267361098, "loss": 0.0883, "num_input_tokens_seen": 3731552, "step": 430, "train_runtime": 3299.6823, "train_tokens_per_second": 1130.882 }, { "epoch": 0.1879760168530222, "grad_norm": 0.0945342481136322, "learning_rate": 0.00019732930905803643, "loss": 0.0807, "num_input_tokens_seen": 3774768, "step": 435, "train_runtime": 3322.0043, "train_tokens_per_second": 1136.292 }, { "epoch": 0.19013666072489602, "grad_norm": 0.09098955243825912, "learning_rate": 0.00019724912110866098, "loss": 0.0864, "num_input_tokens_seen": 3818432, "step": 440, "train_runtime": 3343.5489, "train_tokens_per_second": 1142.03 }, { "epoch": 0.19229730459676983, "grad_norm": 0.07744833081960678, "learning_rate": 0.0001971677638521499, "loss": 0.0795, "num_input_tokens_seen": 3861376, "step": 445, "train_runtime": 3365.195, "train_tokens_per_second": 1147.445 }, { "epoch": 0.19445794846864364, "grad_norm": 0.07406079024076462, "learning_rate": 0.0001970852382667292, "loss": 0.0846, "num_input_tokens_seen": 3905008, "step": 450, "train_runtime": 3386.6968, "train_tokens_per_second": 1153.043 }, { "epoch": 0.19661859234051748, "grad_norm": 0.08702688664197922, "learning_rate": 0.00019700154534467272, "loss": 0.0807, "num_input_tokens_seen": 3948368, "step": 455, "train_runtime": 3408.6979, "train_tokens_per_second": 1158.321 }, { "epoch": 0.1987792362123913, "grad_norm": 0.1021113395690918, "learning_rate": 0.0001969166860922901, "loss": 0.0839, "num_input_tokens_seen": 3992160, "step": 460, "train_runtime": 3430.6539, "train_tokens_per_second": 1163.673 }, { "epoch": 0.2009398800842651, "grad_norm": 0.06688612699508667, "learning_rate": 0.00019683066152991477, "loss": 0.0779, "num_input_tokens_seen": 4035088, "step": 465, "train_runtime": 3451.9765, "train_tokens_per_second": 1168.921 }, { "epoch": 0.20310052395613892, "grad_norm": 0.06544195860624313, "learning_rate": 0.00019674347269189172, "loss": 0.0798, "num_input_tokens_seen": 4078368, "step": 470, "train_runtime": 3473.2579, "train_tokens_per_second": 1174.22 }, { "epoch": 0.20526116782801274, "grad_norm": 0.0765095129609108, "learning_rate": 0.00019665512062656481, "loss": 0.077, "num_input_tokens_seen": 4121200, "step": 475, "train_runtime": 3494.1767, "train_tokens_per_second": 1179.448 }, { "epoch": 0.20742181169988658, "grad_norm": 0.09076128900051117, "learning_rate": 0.00019656560639626455, "loss": 0.0777, "num_input_tokens_seen": 4164272, "step": 480, "train_runtime": 3515.4286, "train_tokens_per_second": 1184.57 }, { "epoch": 0.2095824555717604, "grad_norm": 0.084928959608078, "learning_rate": 0.00019647493107729505, "loss": 0.0806, "num_input_tokens_seen": 4207440, "step": 485, "train_runtime": 3537.3488, "train_tokens_per_second": 1189.433 }, { "epoch": 0.2117430994436342, "grad_norm": 0.07026708126068115, "learning_rate": 0.0001963830957599211, "loss": 0.0812, "num_input_tokens_seen": 4250592, "step": 490, "train_runtime": 3559.4743, "train_tokens_per_second": 1194.163 }, { "epoch": 0.21390374331550802, "grad_norm": 0.08042414486408234, "learning_rate": 0.0001962901015483552, "loss": 0.0776, "num_input_tokens_seen": 4293824, "step": 495, "train_runtime": 3581.141, "train_tokens_per_second": 1199.01 }, { "epoch": 0.21606438718738183, "grad_norm": 0.10094469785690308, "learning_rate": 0.00019619594956074416, "loss": 0.0883, "num_input_tokens_seen": 4337648, "step": 500, "train_runtime": 3604.6947, "train_tokens_per_second": 1203.333 }, { "epoch": 0.21822503105925567, "grad_norm": 0.08260887116193771, "learning_rate": 0.0001961006409291557, "loss": 0.0728, "num_input_tokens_seen": 4380864, "step": 505, "train_runtime": 3626.825, "train_tokens_per_second": 1207.906 }, { "epoch": 0.22038567493112948, "grad_norm": 0.09819753468036652, "learning_rate": 0.00019600417679956485, "loss": 0.0995, "num_input_tokens_seen": 4425184, "step": 510, "train_runtime": 3649.4129, "train_tokens_per_second": 1212.574 }, { "epoch": 0.2225463188030033, "grad_norm": 0.08613722771406174, "learning_rate": 0.00019590655833184008, "loss": 0.0913, "num_input_tokens_seen": 4469072, "step": 515, "train_runtime": 3671.0009, "train_tokens_per_second": 1217.399 }, { "epoch": 0.2247069626748771, "grad_norm": 0.0694877877831459, "learning_rate": 0.00019580778669972958, "loss": 0.0776, "num_input_tokens_seen": 4512896, "step": 520, "train_runtime": 3692.8898, "train_tokens_per_second": 1222.05 }, { "epoch": 0.22686760654675092, "grad_norm": 0.07937192916870117, "learning_rate": 0.0001957078630908468, "loss": 0.0815, "num_input_tokens_seen": 4556272, "step": 525, "train_runtime": 3714.5571, "train_tokens_per_second": 1226.599 }, { "epoch": 0.22902825041862476, "grad_norm": 0.11298541724681854, "learning_rate": 0.00019560678870665657, "loss": 0.0931, "num_input_tokens_seen": 4600080, "step": 530, "train_runtime": 3736.33, "train_tokens_per_second": 1231.176 }, { "epoch": 0.23118889429049858, "grad_norm": 0.10687752813100815, "learning_rate": 0.00019550456476246026, "loss": 0.0846, "num_input_tokens_seen": 4643520, "step": 535, "train_runtime": 3757.6182, "train_tokens_per_second": 1235.762 }, { "epoch": 0.2333495381623724, "grad_norm": 0.0689394399523735, "learning_rate": 0.00019540119248738152, "loss": 0.0825, "num_input_tokens_seen": 4686992, "step": 540, "train_runtime": 3779.7487, "train_tokens_per_second": 1240.027 }, { "epoch": 0.2355101820342462, "grad_norm": 0.07380052655935287, "learning_rate": 0.00019529667312435123, "loss": 0.0718, "num_input_tokens_seen": 4729856, "step": 545, "train_runtime": 3802.086, "train_tokens_per_second": 1244.016 }, { "epoch": 0.23767082590612001, "grad_norm": 0.07186949253082275, "learning_rate": 0.00019519100793009267, "loss": 0.0728, "num_input_tokens_seen": 4773024, "step": 550, "train_runtime": 3823.73, "train_tokens_per_second": 1248.264 }, { "epoch": 0.23983146977799386, "grad_norm": 0.06932114064693451, "learning_rate": 0.00019508419817510647, "loss": 0.0742, "num_input_tokens_seen": 4815984, "step": 555, "train_runtime": 3844.7929, "train_tokens_per_second": 1252.599 }, { "epoch": 0.24199211364986767, "grad_norm": 0.08667387068271637, "learning_rate": 0.0001949762451436552, "loss": 0.0791, "num_input_tokens_seen": 4859168, "step": 560, "train_runtime": 3866.2134, "train_tokens_per_second": 1256.829 }, { "epoch": 0.24415275752174148, "grad_norm": 0.10014659911394119, "learning_rate": 0.00019486715013374803, "loss": 0.0818, "num_input_tokens_seen": 4902272, "step": 565, "train_runtime": 3887.7872, "train_tokens_per_second": 1260.941 }, { "epoch": 0.2463134013936153, "grad_norm": 0.06697220355272293, "learning_rate": 0.00019475691445712507, "loss": 0.07, "num_input_tokens_seen": 4945312, "step": 570, "train_runtime": 3909.6692, "train_tokens_per_second": 1264.893 }, { "epoch": 0.2484740452654891, "grad_norm": 0.09417334198951721, "learning_rate": 0.00019464553943924164, "loss": 0.0856, "num_input_tokens_seen": 4988832, "step": 575, "train_runtime": 3932.5558, "train_tokens_per_second": 1268.598 }, { "epoch": 0.2506346891373629, "grad_norm": 0.09484616667032242, "learning_rate": 0.00019453302641925227, "loss": 0.0806, "num_input_tokens_seen": 5032144, "step": 580, "train_runtime": 3954.2714, "train_tokens_per_second": 1272.584 }, { "epoch": 0.25279533300923673, "grad_norm": 0.07154662162065506, "learning_rate": 0.00019441937674999468, "loss": 0.0791, "num_input_tokens_seen": 5075712, "step": 585, "train_runtime": 3976.0177, "train_tokens_per_second": 1276.582 }, { "epoch": 0.25495597688111055, "grad_norm": 0.08197880536317825, "learning_rate": 0.00019430459179797343, "loss": 0.0776, "num_input_tokens_seen": 5118784, "step": 590, "train_runtime": 3998.0938, "train_tokens_per_second": 1280.306 }, { "epoch": 0.2571166207529844, "grad_norm": 0.09517450630664825, "learning_rate": 0.00019418867294334355, "loss": 0.0783, "num_input_tokens_seen": 5162224, "step": 595, "train_runtime": 4019.3148, "train_tokens_per_second": 1284.354 }, { "epoch": 0.25927726462485823, "grad_norm": 0.07223788648843765, "learning_rate": 0.00019407162157989393, "loss": 0.0734, "num_input_tokens_seen": 5205120, "step": 600, "train_runtime": 4040.9479, "train_tokens_per_second": 1288.094 }, { "epoch": 0.25927726462485823, "eval_loss": 0.07911964505910873, "eval_runtime": 711.8752, "eval_samples_per_second": 13.004, "eval_steps_per_second": 6.503, "num_input_tokens_seen": 5205120, "step": 600 }, { "epoch": 0.26143790849673204, "grad_norm": 0.09286199510097504, "learning_rate": 0.00019395343911503057, "loss": 0.0843, "num_input_tokens_seen": 5248688, "step": 605, "train_runtime": 4780.1535, "train_tokens_per_second": 1098.017 }, { "epoch": 0.26359855236860585, "grad_norm": 0.08078984916210175, "learning_rate": 0.00019383412696975956, "loss": 0.0788, "num_input_tokens_seen": 5292032, "step": 610, "train_runtime": 4801.333, "train_tokens_per_second": 1102.201 }, { "epoch": 0.26575919624047967, "grad_norm": 0.07582154124975204, "learning_rate": 0.0001937136865786702, "loss": 0.0876, "num_input_tokens_seen": 5335920, "step": 615, "train_runtime": 4824.0, "train_tokens_per_second": 1106.119 }, { "epoch": 0.2679198401123535, "grad_norm": 0.0737927258014679, "learning_rate": 0.00019359211938991755, "loss": 0.0784, "num_input_tokens_seen": 5379328, "step": 620, "train_runtime": 4845.4042, "train_tokens_per_second": 1110.192 }, { "epoch": 0.2700804839842273, "grad_norm": 0.09328042715787888, "learning_rate": 0.0001934694268652051, "loss": 0.081, "num_input_tokens_seen": 5422832, "step": 625, "train_runtime": 4867.5202, "train_tokens_per_second": 1114.085 }, { "epoch": 0.2722411278561011, "grad_norm": 0.07202576845884323, "learning_rate": 0.00019334561047976723, "loss": 0.0837, "num_input_tokens_seen": 5466160, "step": 630, "train_runtime": 4888.8853, "train_tokens_per_second": 1118.079 }, { "epoch": 0.2744017717279749, "grad_norm": 0.07730934768915176, "learning_rate": 0.00019322067172235138, "loss": 0.0864, "num_input_tokens_seen": 5509760, "step": 635, "train_runtime": 4910.512, "train_tokens_per_second": 1122.034 }, { "epoch": 0.27656241559984873, "grad_norm": 0.06718683242797852, "learning_rate": 0.0001930946120952003, "loss": 0.0664, "num_input_tokens_seen": 5552416, "step": 640, "train_runtime": 4931.3543, "train_tokens_per_second": 1125.941 }, { "epoch": 0.2787230594717226, "grad_norm": 0.07979665696620941, "learning_rate": 0.00019296743311403376, "loss": 0.0909, "num_input_tokens_seen": 5595936, "step": 645, "train_runtime": 4952.8952, "train_tokens_per_second": 1129.831 }, { "epoch": 0.2808837033435964, "grad_norm": 0.08611753582954407, "learning_rate": 0.00019283913630803055, "loss": 0.0883, "num_input_tokens_seen": 5639392, "step": 650, "train_runtime": 4974.7226, "train_tokens_per_second": 1133.609 }, { "epoch": 0.2830443472154702, "grad_norm": 0.0799168050289154, "learning_rate": 0.00019270972321980991, "loss": 0.0816, "num_input_tokens_seen": 5682688, "step": 655, "train_runtime": 4995.9321, "train_tokens_per_second": 1137.463 }, { "epoch": 0.28520499108734404, "grad_norm": 0.0729471817612648, "learning_rate": 0.0001925791954054132, "loss": 0.0741, "num_input_tokens_seen": 5725632, "step": 660, "train_runtime": 5017.5209, "train_tokens_per_second": 1141.128 }, { "epoch": 0.28736563495921785, "grad_norm": 0.10275959223508835, "learning_rate": 0.00019244755443428494, "loss": 0.0841, "num_input_tokens_seen": 5769488, "step": 665, "train_runtime": 5039.1393, "train_tokens_per_second": 1144.935 }, { "epoch": 0.28952627883109167, "grad_norm": 0.0760921835899353, "learning_rate": 0.00019231480188925412, "loss": 0.0833, "num_input_tokens_seen": 5812736, "step": 670, "train_runtime": 5060.9978, "train_tokens_per_second": 1148.536 }, { "epoch": 0.2916869227029655, "grad_norm": 0.0875801295042038, "learning_rate": 0.0001921809393665151, "loss": 0.0817, "num_input_tokens_seen": 5856224, "step": 675, "train_runtime": 5082.9523, "train_tokens_per_second": 1152.13 }, { "epoch": 0.2938475665748393, "grad_norm": 0.07346130162477493, "learning_rate": 0.0001920459684756084, "loss": 0.0815, "num_input_tokens_seen": 5899728, "step": 680, "train_runtime": 5104.4384, "train_tokens_per_second": 1155.804 }, { "epoch": 0.2960082104467131, "grad_norm": 0.07405713200569153, "learning_rate": 0.00019190989083940144, "loss": 0.08, "num_input_tokens_seen": 5943184, "step": 685, "train_runtime": 5126.9699, "train_tokens_per_second": 1159.2 }, { "epoch": 0.2981688543185869, "grad_norm": 0.074583999812603, "learning_rate": 0.00019177270809406886, "loss": 0.0753, "num_input_tokens_seen": 5986528, "step": 690, "train_runtime": 5148.3756, "train_tokens_per_second": 1162.799 }, { "epoch": 0.30032949819046073, "grad_norm": 0.08363614976406097, "learning_rate": 0.00019163442188907306, "loss": 0.0789, "num_input_tokens_seen": 6029792, "step": 695, "train_runtime": 5170.5975, "train_tokens_per_second": 1166.169 }, { "epoch": 0.3024901420623346, "grad_norm": 0.07529831677675247, "learning_rate": 0.00019149503388714414, "loss": 0.0782, "num_input_tokens_seen": 6072960, "step": 700, "train_runtime": 5192.0761, "train_tokens_per_second": 1169.659 }, { "epoch": 0.3046507859342084, "grad_norm": 0.08652273565530777, "learning_rate": 0.0001913545457642601, "loss": 0.0859, "num_input_tokens_seen": 6116880, "step": 705, "train_runtime": 5214.4006, "train_tokens_per_second": 1173.074 }, { "epoch": 0.3068114298060822, "grad_norm": 0.09629788249731064, "learning_rate": 0.00019121295920962662, "loss": 0.0767, "num_input_tokens_seen": 6160256, "step": 710, "train_runtime": 5235.856, "train_tokens_per_second": 1176.552 }, { "epoch": 0.30897207367795604, "grad_norm": 0.06942661106586456, "learning_rate": 0.00019107027592565662, "loss": 0.0814, "num_input_tokens_seen": 6203664, "step": 715, "train_runtime": 5258.1546, "train_tokens_per_second": 1179.818 }, { "epoch": 0.31113271754982985, "grad_norm": 0.05736853554844856, "learning_rate": 0.00019092649762795009, "loss": 0.0758, "num_input_tokens_seen": 6246864, "step": 720, "train_runtime": 5279.6223, "train_tokens_per_second": 1183.203 }, { "epoch": 0.31329336142170366, "grad_norm": 0.08216078579425812, "learning_rate": 0.00019078162604527313, "loss": 0.0786, "num_input_tokens_seen": 6290208, "step": 725, "train_runtime": 5301.567, "train_tokens_per_second": 1186.481 }, { "epoch": 0.3154540052935775, "grad_norm": 0.07779684662818909, "learning_rate": 0.00019063566291953739, "loss": 0.0706, "num_input_tokens_seen": 6333120, "step": 730, "train_runtime": 5323.539, "train_tokens_per_second": 1189.645 }, { "epoch": 0.3176146491654513, "grad_norm": 0.0775391012430191, "learning_rate": 0.00019048861000577904, "loss": 0.0763, "num_input_tokens_seen": 6376640, "step": 735, "train_runtime": 5346.0257, "train_tokens_per_second": 1192.781 }, { "epoch": 0.3197752930373251, "grad_norm": 0.08086078613996506, "learning_rate": 0.00019034046907213768, "loss": 0.086, "num_input_tokens_seen": 6420272, "step": 740, "train_runtime": 5367.655, "train_tokens_per_second": 1196.104 }, { "epoch": 0.3219359369091989, "grad_norm": 0.10933763533830643, "learning_rate": 0.00019019124189983502, "loss": 0.0865, "num_input_tokens_seen": 6464288, "step": 745, "train_runtime": 5389.0206, "train_tokens_per_second": 1199.529 }, { "epoch": 0.3240965807810728, "grad_norm": 0.07312079519033432, "learning_rate": 0.00019004093028315367, "loss": 0.0734, "num_input_tokens_seen": 6507472, "step": 750, "train_runtime": 5410.7467, "train_tokens_per_second": 1202.694 }, { "epoch": 0.3262572246529466, "grad_norm": 0.09135115891695023, "learning_rate": 0.00018988953602941522, "loss": 0.0857, "num_input_tokens_seen": 6551152, "step": 755, "train_runtime": 5432.6461, "train_tokens_per_second": 1205.886 }, { "epoch": 0.3284178685248204, "grad_norm": 0.07926656305789948, "learning_rate": 0.00018973706095895887, "loss": 0.0801, "num_input_tokens_seen": 6594464, "step": 760, "train_runtime": 5453.9931, "train_tokens_per_second": 1209.107 }, { "epoch": 0.3305785123966942, "grad_norm": 0.07842066138982773, "learning_rate": 0.00018958350690511928, "loss": 0.0708, "num_input_tokens_seen": 6637648, "step": 765, "train_runtime": 5475.6934, "train_tokens_per_second": 1212.202 }, { "epoch": 0.33273915626856804, "grad_norm": 0.062182243913412094, "learning_rate": 0.00018942887571420469, "loss": 0.0734, "num_input_tokens_seen": 6680960, "step": 770, "train_runtime": 5496.9379, "train_tokens_per_second": 1215.397 }, { "epoch": 0.33489980014044185, "grad_norm": 0.08400420844554901, "learning_rate": 0.0001892731692454746, "loss": 0.0793, "num_input_tokens_seen": 6724688, "step": 775, "train_runtime": 5519.4401, "train_tokens_per_second": 1218.364 }, { "epoch": 0.33706044401231566, "grad_norm": 0.07636286318302155, "learning_rate": 0.0001891163893711175, "loss": 0.0746, "num_input_tokens_seen": 6767856, "step": 780, "train_runtime": 5541.114, "train_tokens_per_second": 1221.389 }, { "epoch": 0.3392210878841895, "grad_norm": 0.08771245926618576, "learning_rate": 0.00018895853797622837, "loss": 0.0856, "num_input_tokens_seen": 6811408, "step": 785, "train_runtime": 5562.918, "train_tokens_per_second": 1224.431 }, { "epoch": 0.3413817317560633, "grad_norm": 0.07764877378940582, "learning_rate": 0.00018879961695878586, "loss": 0.0814, "num_input_tokens_seen": 6854928, "step": 790, "train_runtime": 5584.5564, "train_tokens_per_second": 1227.479 }, { "epoch": 0.3435423756279371, "grad_norm": 0.07771775126457214, "learning_rate": 0.00018863962822962974, "loss": 0.076, "num_input_tokens_seen": 6898064, "step": 795, "train_runtime": 5606.9982, "train_tokens_per_second": 1230.26 }, { "epoch": 0.34570301949981097, "grad_norm": 0.07678196579217911, "learning_rate": 0.00018847857371243762, "loss": 0.0772, "num_input_tokens_seen": 6941760, "step": 800, "train_runtime": 5628.6983, "train_tokens_per_second": 1233.28 }, { "epoch": 0.34570301949981097, "eval_loss": 0.07707177847623825, "eval_runtime": 710.7395, "eval_samples_per_second": 13.024, "eval_steps_per_second": 6.513, "num_input_tokens_seen": 6941760, "step": 800 }, { "epoch": 0.3478636633716848, "grad_norm": 0.0912775844335556, "learning_rate": 0.000188316455343702, "loss": 0.0762, "num_input_tokens_seen": 6984928, "step": 805, "train_runtime": 6366.9321, "train_tokens_per_second": 1097.063 }, { "epoch": 0.3500243072435586, "grad_norm": 0.08312050998210907, "learning_rate": 0.00018815327507270703, "loss": 0.0817, "num_input_tokens_seen": 7028864, "step": 810, "train_runtime": 6390.056, "train_tokens_per_second": 1099.969 }, { "epoch": 0.3521849511154324, "grad_norm": 0.09067723900079727, "learning_rate": 0.00018798903486150494, "loss": 0.0858, "num_input_tokens_seen": 7072528, "step": 815, "train_runtime": 6412.3692, "train_tokens_per_second": 1102.951 }, { "epoch": 0.3543455949873062, "grad_norm": 0.09034962207078934, "learning_rate": 0.0001878237366848925, "loss": 0.0811, "num_input_tokens_seen": 7116176, "step": 820, "train_runtime": 6434.2034, "train_tokens_per_second": 1105.992 }, { "epoch": 0.35650623885918004, "grad_norm": 0.07377108186483383, "learning_rate": 0.00018765738253038726, "loss": 0.0759, "num_input_tokens_seen": 7159504, "step": 825, "train_runtime": 6455.9184, "train_tokens_per_second": 1108.983 }, { "epoch": 0.35866688273105385, "grad_norm": 0.08932390064001083, "learning_rate": 0.00018748997439820372, "loss": 0.0686, "num_input_tokens_seen": 7202400, "step": 830, "train_runtime": 6478.1722, "train_tokens_per_second": 1111.795 }, { "epoch": 0.36082752660292766, "grad_norm": 0.06967565417289734, "learning_rate": 0.0001873215143012292, "loss": 0.0763, "num_input_tokens_seen": 7245616, "step": 835, "train_runtime": 6499.9987, "train_tokens_per_second": 1114.71 }, { "epoch": 0.3629881704748015, "grad_norm": 0.08724388480186462, "learning_rate": 0.00018715200426499973, "loss": 0.0818, "num_input_tokens_seen": 7289264, "step": 840, "train_runtime": 6521.9735, "train_tokens_per_second": 1117.647 }, { "epoch": 0.3651488143466753, "grad_norm": 0.06765586882829666, "learning_rate": 0.00018698144632767547, "loss": 0.08, "num_input_tokens_seen": 7332992, "step": 845, "train_runtime": 6544.5412, "train_tokens_per_second": 1120.475 }, { "epoch": 0.3673094582185491, "grad_norm": 0.08761299401521683, "learning_rate": 0.00018680984254001656, "loss": 0.0814, "num_input_tokens_seen": 7376768, "step": 850, "train_runtime": 6566.4346, "train_tokens_per_second": 1123.405 }, { "epoch": 0.36947010209042297, "grad_norm": 0.0771804228425026, "learning_rate": 0.00018663719496535817, "loss": 0.0815, "num_input_tokens_seen": 7420656, "step": 855, "train_runtime": 6588.7311, "train_tokens_per_second": 1126.265 }, { "epoch": 0.3716307459622968, "grad_norm": 0.0810341015458107, "learning_rate": 0.00018646350567958582, "loss": 0.0794, "num_input_tokens_seen": 7463984, "step": 860, "train_runtime": 6610.4567, "train_tokens_per_second": 1129.118 }, { "epoch": 0.3737913898341706, "grad_norm": 0.07731346040964127, "learning_rate": 0.0001862887767711103, "loss": 0.0755, "num_input_tokens_seen": 7507264, "step": 865, "train_runtime": 6631.9401, "train_tokens_per_second": 1131.986 }, { "epoch": 0.3759520337060444, "grad_norm": 0.06987843662500381, "learning_rate": 0.00018611301034084283, "loss": 0.0796, "num_input_tokens_seen": 7550544, "step": 870, "train_runtime": 6653.1898, "train_tokens_per_second": 1134.876 }, { "epoch": 0.3781126775779182, "grad_norm": 0.08283229172229767, "learning_rate": 0.00018593620850216943, "loss": 0.0909, "num_input_tokens_seen": 7594592, "step": 875, "train_runtime": 6674.6719, "train_tokens_per_second": 1137.823 }, { "epoch": 0.38027332144979203, "grad_norm": 0.0696164071559906, "learning_rate": 0.00018575837338092582, "loss": 0.0745, "num_input_tokens_seen": 7637744, "step": 880, "train_runtime": 6696.8868, "train_tokens_per_second": 1140.492 }, { "epoch": 0.38243396532166585, "grad_norm": 0.07360994070768356, "learning_rate": 0.00018557950711537173, "loss": 0.0815, "num_input_tokens_seen": 7681232, "step": 885, "train_runtime": 6719.3031, "train_tokens_per_second": 1143.159 }, { "epoch": 0.38459460919353966, "grad_norm": 0.07119850069284439, "learning_rate": 0.0001853996118561651, "loss": 0.0732, "num_input_tokens_seen": 7724352, "step": 890, "train_runtime": 6741.235, "train_tokens_per_second": 1145.836 }, { "epoch": 0.3867552530654135, "grad_norm": 0.06476875394582748, "learning_rate": 0.0001852186897663364, "loss": 0.0695, "num_input_tokens_seen": 7767408, "step": 895, "train_runtime": 6762.9413, "train_tokens_per_second": 1148.525 }, { "epoch": 0.3889158969372873, "grad_norm": 0.07978302985429764, "learning_rate": 0.00018503674302126254, "loss": 0.0743, "num_input_tokens_seen": 7810688, "step": 900, "train_runtime": 6785.1546, "train_tokens_per_second": 1151.144 }, { "epoch": 0.39107654080916116, "grad_norm": 0.05432932823896408, "learning_rate": 0.00018485377380864069, "loss": 0.0769, "num_input_tokens_seen": 7854352, "step": 905, "train_runtime": 6808.2135, "train_tokens_per_second": 1153.658 }, { "epoch": 0.39323718468103497, "grad_norm": 0.08450411260128021, "learning_rate": 0.00018466978432846198, "loss": 0.0813, "num_input_tokens_seen": 7897824, "step": 910, "train_runtime": 6829.4279, "train_tokens_per_second": 1156.44 }, { "epoch": 0.3953978285529088, "grad_norm": 0.08708320558071136, "learning_rate": 0.00018448477679298508, "loss": 0.0828, "num_input_tokens_seen": 7941424, "step": 915, "train_runtime": 6851.7426, "train_tokens_per_second": 1159.037 }, { "epoch": 0.3975584724247826, "grad_norm": 0.07201456278562546, "learning_rate": 0.00018429875342670964, "loss": 0.0742, "num_input_tokens_seen": 7984608, "step": 920, "train_runtime": 6873.2801, "train_tokens_per_second": 1161.688 }, { "epoch": 0.3997191162966564, "grad_norm": 0.07260388880968094, "learning_rate": 0.00018411171646634937, "loss": 0.0744, "num_input_tokens_seen": 8028192, "step": 925, "train_runtime": 6894.4722, "train_tokens_per_second": 1164.439 }, { "epoch": 0.4018797601685302, "grad_norm": 0.08266861736774445, "learning_rate": 0.00018392366816080542, "loss": 0.0794, "num_input_tokens_seen": 8071584, "step": 930, "train_runtime": 6916.4175, "train_tokens_per_second": 1167.018 }, { "epoch": 0.40404040404040403, "grad_norm": 0.08442539721727371, "learning_rate": 0.00018373461077113908, "loss": 0.0837, "num_input_tokens_seen": 8115056, "step": 935, "train_runtime": 6938.0681, "train_tokens_per_second": 1169.642 }, { "epoch": 0.40620104791227785, "grad_norm": 0.07598377764225006, "learning_rate": 0.00018354454657054469, "loss": 0.0806, "num_input_tokens_seen": 8158976, "step": 940, "train_runtime": 6959.6868, "train_tokens_per_second": 1172.319 }, { "epoch": 0.40836169178415166, "grad_norm": 0.06979737430810928, "learning_rate": 0.00018335347784432236, "loss": 0.0846, "num_input_tokens_seen": 8203008, "step": 945, "train_runtime": 6981.6345, "train_tokens_per_second": 1174.941 }, { "epoch": 0.41052233565602547, "grad_norm": 0.08268395811319351, "learning_rate": 0.00018316140688985047, "loss": 0.0813, "num_input_tokens_seen": 8246112, "step": 950, "train_runtime": 7002.7827, "train_tokens_per_second": 1177.548 }, { "epoch": 0.41268297952789934, "grad_norm": 0.0866621881723404, "learning_rate": 0.00018296833601655794, "loss": 0.0759, "num_input_tokens_seen": 8289408, "step": 955, "train_runtime": 7024.109, "train_tokens_per_second": 1180.137 }, { "epoch": 0.41484362339977315, "grad_norm": 0.0772893950343132, "learning_rate": 0.0001827742675458966, "loss": 0.0768, "num_input_tokens_seen": 8332832, "step": 960, "train_runtime": 7045.8829, "train_tokens_per_second": 1182.653 }, { "epoch": 0.41700426727164697, "grad_norm": 0.07792758196592331, "learning_rate": 0.00018257920381131327, "loss": 0.0824, "num_input_tokens_seen": 8376720, "step": 965, "train_runtime": 7067.7123, "train_tokens_per_second": 1185.21 }, { "epoch": 0.4191649111435208, "grad_norm": 0.07842139154672623, "learning_rate": 0.00018238314715822158, "loss": 0.0836, "num_input_tokens_seen": 8420304, "step": 970, "train_runtime": 7089.4634, "train_tokens_per_second": 1187.721 }, { "epoch": 0.4213255550153946, "grad_norm": 0.07367521524429321, "learning_rate": 0.00018218609994397387, "loss": 0.0838, "num_input_tokens_seen": 8463904, "step": 975, "train_runtime": 7111.0203, "train_tokens_per_second": 1190.252 }, { "epoch": 0.4234861988872684, "grad_norm": 0.06106347218155861, "learning_rate": 0.0001819880645378328, "loss": 0.0787, "num_input_tokens_seen": 8507328, "step": 980, "train_runtime": 7133.5273, "train_tokens_per_second": 1192.584 }, { "epoch": 0.4256468427591422, "grad_norm": 0.04488658905029297, "learning_rate": 0.00018178904332094293, "loss": 0.0685, "num_input_tokens_seen": 8550368, "step": 985, "train_runtime": 7155.5568, "train_tokens_per_second": 1194.927 }, { "epoch": 0.42780748663101603, "grad_norm": 0.0789346843957901, "learning_rate": 0.00018158903868630203, "loss": 0.0916, "num_input_tokens_seen": 8594080, "step": 990, "train_runtime": 7177.2878, "train_tokens_per_second": 1197.399 }, { "epoch": 0.42996813050288984, "grad_norm": 0.08458510786294937, "learning_rate": 0.0001813880530387323, "loss": 0.0702, "num_input_tokens_seen": 8637200, "step": 995, "train_runtime": 7199.493, "train_tokens_per_second": 1199.696 }, { "epoch": 0.43212877437476366, "grad_norm": 0.07093961536884308, "learning_rate": 0.0001811860887948515, "loss": 0.0763, "num_input_tokens_seen": 8680800, "step": 1000, "train_runtime": 7222.5962, "train_tokens_per_second": 1201.895 }, { "epoch": 0.43212877437476366, "eval_loss": 0.07520591467618942, "eval_runtime": 12182.4571, "eval_samples_per_second": 0.76, "eval_steps_per_second": 0.38, "num_input_tokens_seen": 8680800, "step": 1000 }, { "epoch": 0.43428941824663747, "grad_norm": 0.08109795302152634, "learning_rate": 0.00018098314838304382, "loss": 0.0744, "num_input_tokens_seen": 8724480, "step": 1005, "train_runtime": 19429.5699, "train_tokens_per_second": 449.031 }, { "epoch": 0.43645006211851134, "grad_norm": 0.0698382705450058, "learning_rate": 0.00018077923424343083, "loss": 0.0702, "num_input_tokens_seen": 8767712, "step": 1010, "train_runtime": 19451.8383, "train_tokens_per_second": 450.74 }, { "epoch": 0.43861070599038515, "grad_norm": 0.07674521207809448, "learning_rate": 0.00018057434882784188, "loss": 0.0738, "num_input_tokens_seen": 8811312, "step": 1015, "train_runtime": 19473.1463, "train_tokens_per_second": 452.485 }, { "epoch": 0.44077134986225897, "grad_norm": 0.07470937073230743, "learning_rate": 0.00018036849459978493, "loss": 0.0746, "num_input_tokens_seen": 8854448, "step": 1020, "train_runtime": 19494.278, "train_tokens_per_second": 454.208 }, { "epoch": 0.4429319937341328, "grad_norm": 0.06504765897989273, "learning_rate": 0.00018016167403441674, "loss": 0.0734, "num_input_tokens_seen": 8897664, "step": 1025, "train_runtime": 19515.6756, "train_tokens_per_second": 455.924 }, { "epoch": 0.4450926376060066, "grad_norm": 0.07647648453712463, "learning_rate": 0.00017995388961851308, "loss": 0.0818, "num_input_tokens_seen": 8941408, "step": 1030, "train_runtime": 19537.3869, "train_tokens_per_second": 457.656 }, { "epoch": 0.4472532814778804, "grad_norm": 0.08472959697246552, "learning_rate": 0.00017974514385043897, "loss": 0.0793, "num_input_tokens_seen": 8984800, "step": 1035, "train_runtime": 19558.7936, "train_tokens_per_second": 459.374 }, { "epoch": 0.4494139253497542, "grad_norm": 0.06375865638256073, "learning_rate": 0.00017953543924011854, "loss": 0.0807, "num_input_tokens_seen": 9028528, "step": 1040, "train_runtime": 19580.2573, "train_tokens_per_second": 461.104 }, { "epoch": 0.45157456922162803, "grad_norm": 0.06762372702360153, "learning_rate": 0.00017932477830900494, "loss": 0.0756, "num_input_tokens_seen": 9071760, "step": 1045, "train_runtime": 19602.2512, "train_tokens_per_second": 462.792 }, { "epoch": 0.45373521309350184, "grad_norm": 0.08145523816347122, "learning_rate": 0.00017911316359004982, "loss": 0.0806, "num_input_tokens_seen": 9115312, "step": 1050, "train_runtime": 19624.3829, "train_tokens_per_second": 464.489 }, { "epoch": 0.45589585696537566, "grad_norm": 0.0655316486954689, "learning_rate": 0.0001789005976276731, "loss": 0.0742, "num_input_tokens_seen": 9158656, "step": 1055, "train_runtime": 19646.0519, "train_tokens_per_second": 466.183 }, { "epoch": 0.4580565008372495, "grad_norm": 0.07092972844839096, "learning_rate": 0.00017868708297773237, "loss": 0.0747, "num_input_tokens_seen": 9201744, "step": 1060, "train_runtime": 19667.4653, "train_tokens_per_second": 467.866 }, { "epoch": 0.46021714470912334, "grad_norm": 0.07497821748256683, "learning_rate": 0.00017847262220749196, "loss": 0.0809, "num_input_tokens_seen": 9245328, "step": 1065, "train_runtime": 19688.7343, "train_tokens_per_second": 469.575 }, { "epoch": 0.46237778858099715, "grad_norm": 0.07463043928146362, "learning_rate": 0.00017825721789559217, "loss": 0.0782, "num_input_tokens_seen": 9288800, "step": 1070, "train_runtime": 19710.406, "train_tokens_per_second": 471.264 }, { "epoch": 0.46453843245287096, "grad_norm": 0.050223931670188904, "learning_rate": 0.00017804087263201845, "loss": 0.0772, "num_input_tokens_seen": 9332304, "step": 1075, "train_runtime": 19731.8737, "train_tokens_per_second": 472.956 }, { "epoch": 0.4666990763247448, "grad_norm": 0.07988058030605316, "learning_rate": 0.00017782358901806994, "loss": 0.0755, "num_input_tokens_seen": 9375888, "step": 1080, "train_runtime": 19753.6903, "train_tokens_per_second": 474.64 }, { "epoch": 0.4688597201966186, "grad_norm": 0.07957769185304642, "learning_rate": 0.00017760536966632842, "loss": 0.0817, "num_input_tokens_seen": 9419248, "step": 1085, "train_runtime": 19775.1655, "train_tokens_per_second": 476.317 }, { "epoch": 0.4710203640684924, "grad_norm": 0.059474822133779526, "learning_rate": 0.0001773862172006268, "loss": 0.0788, "num_input_tokens_seen": 9462496, "step": 1090, "train_runtime": 19797.2478, "train_tokens_per_second": 477.97 }, { "epoch": 0.4731810079403662, "grad_norm": 0.08469399064779282, "learning_rate": 0.00017716613425601763, "loss": 0.0823, "num_input_tokens_seen": 9506512, "step": 1095, "train_runtime": 19819.423, "train_tokens_per_second": 479.656 }, { "epoch": 0.47534165181224003, "grad_norm": 0.07042556256055832, "learning_rate": 0.00017694512347874133, "loss": 0.0781, "num_input_tokens_seen": 9550080, "step": 1100, "train_runtime": 19841.9888, "train_tokens_per_second": 481.307 }, { "epoch": 0.47750229568411384, "grad_norm": 0.06320291012525558, "learning_rate": 0.0001767231875261944, "loss": 0.0753, "num_input_tokens_seen": 9593424, "step": 1105, "train_runtime": 19863.9008, "train_tokens_per_second": 482.958 }, { "epoch": 0.4796629395559877, "grad_norm": 0.06841768324375153, "learning_rate": 0.00017650032906689763, "loss": 0.071, "num_input_tokens_seen": 9636816, "step": 1110, "train_runtime": 19886.1652, "train_tokens_per_second": 484.599 }, { "epoch": 0.4818235834278615, "grad_norm": 0.07111469656229019, "learning_rate": 0.00017627655078046375, "loss": 0.0811, "num_input_tokens_seen": 9680368, "step": 1115, "train_runtime": 19908.3539, "train_tokens_per_second": 486.247 }, { "epoch": 0.48398422729973534, "grad_norm": 0.08030956983566284, "learning_rate": 0.00017605185535756536, "loss": 0.0729, "num_input_tokens_seen": 9723472, "step": 1120, "train_runtime": 19930.5485, "train_tokens_per_second": 487.868 }, { "epoch": 0.48614487117160915, "grad_norm": 0.06738725304603577, "learning_rate": 0.0001758262454999026, "loss": 0.07, "num_input_tokens_seen": 9766768, "step": 1125, "train_runtime": 19952.8417, "train_tokens_per_second": 489.493 }, { "epoch": 0.48830551504348296, "grad_norm": 0.053996093571186066, "learning_rate": 0.00017559972392017058, "loss": 0.0758, "num_input_tokens_seen": 9810240, "step": 1130, "train_runtime": 19974.4015, "train_tokens_per_second": 491.141 }, { "epoch": 0.4904661589153568, "grad_norm": 0.08248726278543472, "learning_rate": 0.00017537229334202683, "loss": 0.0854, "num_input_tokens_seen": 9853920, "step": 1135, "train_runtime": 19996.79, "train_tokens_per_second": 492.775 }, { "epoch": 0.4926268027872306, "grad_norm": 0.05713077634572983, "learning_rate": 0.0001751439565000585, "loss": 0.0726, "num_input_tokens_seen": 9897120, "step": 1140, "train_runtime": 20018.8567, "train_tokens_per_second": 494.39 }, { "epoch": 0.4947874466591044, "grad_norm": 0.07025758922100067, "learning_rate": 0.00017491471613974947, "loss": 0.0817, "num_input_tokens_seen": 9940432, "step": 1145, "train_runtime": 20039.9461, "train_tokens_per_second": 496.031 }, { "epoch": 0.4969480905309782, "grad_norm": 0.07322239875793457, "learning_rate": 0.00017468457501744749, "loss": 0.0854, "num_input_tokens_seen": 9984032, "step": 1150, "train_runtime": 20061.3593, "train_tokens_per_second": 497.675 }, { "epoch": 0.49910873440285203, "grad_norm": 0.07295897603034973, "learning_rate": 0.0001744535359003308, "loss": 0.0833, "num_input_tokens_seen": 10028048, "step": 1155, "train_runtime": 20082.8787, "train_tokens_per_second": 499.333 }, { "epoch": 0.5012693782747258, "grad_norm": 0.07253481447696686, "learning_rate": 0.00017422160156637507, "loss": 0.0741, "num_input_tokens_seen": 10071520, "step": 1160, "train_runtime": 20104.528, "train_tokens_per_second": 500.958 }, { "epoch": 0.5034300221465997, "grad_norm": 0.06147943064570427, "learning_rate": 0.0001739887748043198, "loss": 0.0712, "num_input_tokens_seen": 10114720, "step": 1165, "train_runtime": 20126.5851, "train_tokens_per_second": 502.555 }, { "epoch": 0.5055906660184735, "grad_norm": 0.07576042413711548, "learning_rate": 0.00017375505841363503, "loss": 0.0786, "num_input_tokens_seen": 10158080, "step": 1170, "train_runtime": 20148.2729, "train_tokens_per_second": 504.166 }, { "epoch": 0.5077513098903473, "grad_norm": 0.0773998275399208, "learning_rate": 0.00017352045520448742, "loss": 0.0734, "num_input_tokens_seen": 10201312, "step": 1175, "train_runtime": 20169.9853, "train_tokens_per_second": 505.767 }, { "epoch": 0.5099119537622211, "grad_norm": 0.0845336839556694, "learning_rate": 0.0001732849679977067, "loss": 0.0742, "num_input_tokens_seen": 10244464, "step": 1180, "train_runtime": 20192.6643, "train_tokens_per_second": 507.336 }, { "epoch": 0.512072597634095, "grad_norm": 0.059365056455135345, "learning_rate": 0.00017304859962475152, "loss": 0.0667, "num_input_tokens_seen": 10287056, "step": 1185, "train_runtime": 20213.7952, "train_tokens_per_second": 508.913 }, { "epoch": 0.5142332415059688, "grad_norm": 0.05587423965334892, "learning_rate": 0.00017281135292767565, "loss": 0.0756, "num_input_tokens_seen": 10330336, "step": 1190, "train_runtime": 20234.9959, "train_tokens_per_second": 510.518 }, { "epoch": 0.5163938853778426, "grad_norm": 0.06663983315229416, "learning_rate": 0.00017257323075909359, "loss": 0.0722, "num_input_tokens_seen": 10373616, "step": 1195, "train_runtime": 20256.6032, "train_tokens_per_second": 512.11 }, { "epoch": 0.5185545292497165, "grad_norm": 0.05629422143101692, "learning_rate": 0.00017233423598214635, "loss": 0.0753, "num_input_tokens_seen": 10417200, "step": 1200, "train_runtime": 20278.7181, "train_tokens_per_second": 513.701 }, { "epoch": 0.5185545292497165, "eval_loss": 0.07407065480947495, "eval_runtime": 713.8125, "eval_samples_per_second": 12.968, "eval_steps_per_second": 6.485, "num_input_tokens_seen": 10417200, "step": 1200 }, { "epoch": 0.5207151731215902, "grad_norm": 0.06868501752614975, "learning_rate": 0.00017209437147046715, "loss": 0.0685, "num_input_tokens_seen": 10460224, "step": 1205, "train_runtime": 21016.8491, "train_tokens_per_second": 497.707 }, { "epoch": 0.5228758169934641, "grad_norm": 0.06577759236097336, "learning_rate": 0.0001718536401081466, "loss": 0.0736, "num_input_tokens_seen": 10503504, "step": 1210, "train_runtime": 21038.6716, "train_tokens_per_second": 499.247 }, { "epoch": 0.5250364608653378, "grad_norm": 0.0667291060090065, "learning_rate": 0.00017161204478969837, "loss": 0.0704, "num_input_tokens_seen": 10546496, "step": 1215, "train_runtime": 21060.9395, "train_tokens_per_second": 500.761 }, { "epoch": 0.5271971047372117, "grad_norm": 0.05542680621147156, "learning_rate": 0.00017136958842002401, "loss": 0.0646, "num_input_tokens_seen": 10589392, "step": 1220, "train_runtime": 21082.1205, "train_tokens_per_second": 502.293 }, { "epoch": 0.5293577486090855, "grad_norm": 0.06269074976444244, "learning_rate": 0.00017112627391437828, "loss": 0.0705, "num_input_tokens_seen": 10632368, "step": 1225, "train_runtime": 21104.5922, "train_tokens_per_second": 503.794 }, { "epoch": 0.5315183924809593, "grad_norm": 0.07169587910175323, "learning_rate": 0.00017088210419833404, "loss": 0.0753, "num_input_tokens_seen": 10675296, "step": 1230, "train_runtime": 21125.5563, "train_tokens_per_second": 505.326 }, { "epoch": 0.5336790363528331, "grad_norm": 0.08142554759979248, "learning_rate": 0.00017063708220774702, "loss": 0.0739, "num_input_tokens_seen": 10719072, "step": 1235, "train_runtime": 21147.4293, "train_tokens_per_second": 506.874 }, { "epoch": 0.535839680224707, "grad_norm": 0.05577947571873665, "learning_rate": 0.00017039121088872062, "loss": 0.0811, "num_input_tokens_seen": 10762544, "step": 1240, "train_runtime": 21169.3825, "train_tokens_per_second": 508.401 }, { "epoch": 0.5380003240965808, "grad_norm": 0.07115308195352554, "learning_rate": 0.0001701444931975703, "loss": 0.0838, "num_input_tokens_seen": 10806464, "step": 1245, "train_runtime": 21190.8204, "train_tokens_per_second": 509.96 }, { "epoch": 0.5401609679684546, "grad_norm": 0.0784154161810875, "learning_rate": 0.00016989693210078835, "loss": 0.077, "num_input_tokens_seen": 10849680, "step": 1250, "train_runtime": 21212.6054, "train_tokens_per_second": 511.473 }, { "epoch": 0.5423216118403285, "grad_norm": 0.06607118248939514, "learning_rate": 0.00016964853057500778, "loss": 0.0776, "num_input_tokens_seen": 10893376, "step": 1255, "train_runtime": 21234.3551, "train_tokens_per_second": 513.007 }, { "epoch": 0.5444822557122022, "grad_norm": 0.08232490718364716, "learning_rate": 0.000169399291606967, "loss": 0.0856, "num_input_tokens_seen": 10937072, "step": 1260, "train_runtime": 21256.5851, "train_tokens_per_second": 514.526 }, { "epoch": 0.5466428995840761, "grad_norm": 0.05634531006217003, "learning_rate": 0.00016914921819347355, "loss": 0.0675, "num_input_tokens_seen": 10979984, "step": 1265, "train_runtime": 21277.7493, "train_tokens_per_second": 516.031 }, { "epoch": 0.5488035434559498, "grad_norm": 0.06430606544017792, "learning_rate": 0.00016889831334136827, "loss": 0.081, "num_input_tokens_seen": 11023376, "step": 1270, "train_runtime": 21299.595, "train_tokens_per_second": 517.539 }, { "epoch": 0.5509641873278237, "grad_norm": 0.07790251821279526, "learning_rate": 0.00016864658006748905, "loss": 0.081, "num_input_tokens_seen": 11066864, "step": 1275, "train_runtime": 21321.0734, "train_tokens_per_second": 519.058 }, { "epoch": 0.5531248311996975, "grad_norm": 0.05007950961589813, "learning_rate": 0.00016839402139863461, "loss": 0.0757, "num_input_tokens_seen": 11109904, "step": 1280, "train_runtime": 21342.7154, "train_tokens_per_second": 520.548 }, { "epoch": 0.5552854750715713, "grad_norm": 0.06686703860759735, "learning_rate": 0.00016814064037152805, "loss": 0.0697, "num_input_tokens_seen": 11153008, "step": 1285, "train_runtime": 21363.8744, "train_tokens_per_second": 522.05 }, { "epoch": 0.5574461189434452, "grad_norm": 0.05480387806892395, "learning_rate": 0.00016788644003278038, "loss": 0.0697, "num_input_tokens_seen": 11196352, "step": 1290, "train_runtime": 21385.9113, "train_tokens_per_second": 523.539 }, { "epoch": 0.559606762815319, "grad_norm": 0.07160132378339767, "learning_rate": 0.00016763142343885384, "loss": 0.0703, "num_input_tokens_seen": 11239520, "step": 1295, "train_runtime": 21407.2306, "train_tokens_per_second": 525.034 }, { "epoch": 0.5617674066871928, "grad_norm": 0.06048699840903282, "learning_rate": 0.0001673755936560253, "loss": 0.0775, "num_input_tokens_seen": 11283248, "step": 1300, "train_runtime": 21428.6481, "train_tokens_per_second": 526.55 }, { "epoch": 0.5639280505590666, "grad_norm": 0.05485227331519127, "learning_rate": 0.0001671189537603491, "loss": 0.0719, "num_input_tokens_seen": 11326608, "step": 1305, "train_runtime": 21449.9158, "train_tokens_per_second": 528.049 }, { "epoch": 0.5660886944309405, "grad_norm": 0.054880425333976746, "learning_rate": 0.00016686150683762038, "loss": 0.0706, "num_input_tokens_seen": 11369648, "step": 1310, "train_runtime": 21471.9256, "train_tokens_per_second": 529.512 }, { "epoch": 0.5682493383028142, "grad_norm": 0.07284388691186905, "learning_rate": 0.00016660325598333783, "loss": 0.0705, "num_input_tokens_seen": 11412624, "step": 1315, "train_runtime": 21493.5433, "train_tokens_per_second": 530.979 }, { "epoch": 0.5704099821746881, "grad_norm": 0.07158586382865906, "learning_rate": 0.00016634420430266644, "loss": 0.0783, "num_input_tokens_seen": 11456256, "step": 1320, "train_runtime": 21515.7322, "train_tokens_per_second": 532.459 }, { "epoch": 0.5725706260465618, "grad_norm": 0.06218944862484932, "learning_rate": 0.00016608435491040016, "loss": 0.07, "num_input_tokens_seen": 11499632, "step": 1325, "train_runtime": 21537.0635, "train_tokens_per_second": 533.946 }, { "epoch": 0.5747312699184357, "grad_norm": 0.06975477933883667, "learning_rate": 0.00016582371093092456, "loss": 0.0701, "num_input_tokens_seen": 11543056, "step": 1330, "train_runtime": 21558.7893, "train_tokens_per_second": 535.422 }, { "epoch": 0.5768919137903095, "grad_norm": 0.07880192995071411, "learning_rate": 0.00016556227549817919, "loss": 0.0778, "num_input_tokens_seen": 11586800, "step": 1335, "train_runtime": 21580.8092, "train_tokens_per_second": 536.903 }, { "epoch": 0.5790525576621833, "grad_norm": 0.06973356753587723, "learning_rate": 0.00016530005175561987, "loss": 0.0646, "num_input_tokens_seen": 11629808, "step": 1340, "train_runtime": 21602.932, "train_tokens_per_second": 538.344 }, { "epoch": 0.5812132015340572, "grad_norm": 0.05525905266404152, "learning_rate": 0.00016503704285618094, "loss": 0.0684, "num_input_tokens_seen": 11673088, "step": 1345, "train_runtime": 21624.3412, "train_tokens_per_second": 539.812 }, { "epoch": 0.583373845405931, "grad_norm": 0.07406817376613617, "learning_rate": 0.00016477325196223732, "loss": 0.0748, "num_input_tokens_seen": 11716752, "step": 1350, "train_runtime": 21646.39, "train_tokens_per_second": 541.28 }, { "epoch": 0.5855344892778048, "grad_norm": 0.06670234352350235, "learning_rate": 0.00016450868224556655, "loss": 0.0778, "num_input_tokens_seen": 11760400, "step": 1355, "train_runtime": 21668.814, "train_tokens_per_second": 542.734 }, { "epoch": 0.5876951331496786, "grad_norm": 0.06407748907804489, "learning_rate": 0.0001642433368873105, "loss": 0.0806, "num_input_tokens_seen": 11803904, "step": 1360, "train_runtime": 21690.4295, "train_tokens_per_second": 544.199 }, { "epoch": 0.5898557770215525, "grad_norm": 0.0650821402668953, "learning_rate": 0.0001639772190779374, "loss": 0.0737, "num_input_tokens_seen": 11846960, "step": 1365, "train_runtime": 21713.0068, "train_tokens_per_second": 545.616 }, { "epoch": 0.5920164208934262, "grad_norm": 0.0700002983212471, "learning_rate": 0.00016371033201720308, "loss": 0.0763, "num_input_tokens_seen": 11891024, "step": 1370, "train_runtime": 21734.9473, "train_tokens_per_second": 547.092 }, { "epoch": 0.5941770647653001, "grad_norm": 0.06870347261428833, "learning_rate": 0.0001634426789141129, "loss": 0.0769, "num_input_tokens_seen": 11934832, "step": 1375, "train_runtime": 21757.2922, "train_tokens_per_second": 548.544 }, { "epoch": 0.5963377086371738, "grad_norm": 0.052877090871334076, "learning_rate": 0.0001631742629868829, "loss": 0.0692, "num_input_tokens_seen": 11978000, "step": 1380, "train_runtime": 21778.7649, "train_tokens_per_second": 549.985 }, { "epoch": 0.5984983525090477, "grad_norm": 0.06286793202161789, "learning_rate": 0.00016290508746290123, "loss": 0.072, "num_input_tokens_seen": 12021552, "step": 1385, "train_runtime": 21800.3728, "train_tokens_per_second": 551.438 }, { "epoch": 0.6006589963809215, "grad_norm": 0.04737339913845062, "learning_rate": 0.00016263515557868923, "loss": 0.0704, "num_input_tokens_seen": 12064832, "step": 1390, "train_runtime": 21823.0417, "train_tokens_per_second": 552.848 }, { "epoch": 0.6028196402527953, "grad_norm": 0.06066849082708359, "learning_rate": 0.0001623644705798627, "loss": 0.0707, "num_input_tokens_seen": 12107952, "step": 1395, "train_runtime": 21844.9979, "train_tokens_per_second": 554.267 }, { "epoch": 0.6049802841246692, "grad_norm": 0.08087003231048584, "learning_rate": 0.0001620930357210927, "loss": 0.0763, "num_input_tokens_seen": 12151680, "step": 1400, "train_runtime": 21867.3505, "train_tokens_per_second": 555.7 }, { "epoch": 0.6049802841246692, "eval_loss": 0.07353422790765762, "eval_runtime": 26942.5423, "eval_samples_per_second": 0.344, "eval_steps_per_second": 0.172, "num_input_tokens_seen": 12151680, "step": 1400 }, { "epoch": 0.607140927996543, "grad_norm": 0.06639593839645386, "learning_rate": 0.00016182085426606646, "loss": 0.0825, "num_input_tokens_seen": 12195536, "step": 1405, "train_runtime": 48834.2475, "train_tokens_per_second": 249.733 }, { "epoch": 0.6093015718684168, "grad_norm": 0.06361619383096695, "learning_rate": 0.0001615479294874482, "loss": 0.0751, "num_input_tokens_seen": 12239248, "step": 1410, "train_runtime": 48856.1569, "train_tokens_per_second": 250.516 }, { "epoch": 0.6114622157402906, "grad_norm": 0.05519590154290199, "learning_rate": 0.0001612742646668397, "loss": 0.0654, "num_input_tokens_seen": 12282320, "step": 1415, "train_runtime": 48877.9032, "train_tokens_per_second": 251.286 }, { "epoch": 0.6136228596121645, "grad_norm": 0.0630306825041771, "learning_rate": 0.0001609998630947409, "loss": 0.071, "num_input_tokens_seen": 12325696, "step": 1420, "train_runtime": 48899.6287, "train_tokens_per_second": 252.061 }, { "epoch": 0.6157835034840382, "grad_norm": 0.07874094694852829, "learning_rate": 0.0001607247280705104, "loss": 0.0754, "num_input_tokens_seen": 12368960, "step": 1425, "train_runtime": 48920.7427, "train_tokens_per_second": 252.837 }, { "epoch": 0.6179441473559121, "grad_norm": 0.07459452748298645, "learning_rate": 0.00016044886290232551, "loss": 0.078, "num_input_tokens_seen": 12412576, "step": 1430, "train_runtime": 48942.9145, "train_tokens_per_second": 253.613 }, { "epoch": 0.6201047912277858, "grad_norm": 0.05790963023900986, "learning_rate": 0.0001601722709071429, "loss": 0.0715, "num_input_tokens_seen": 12456080, "step": 1435, "train_runtime": 48964.6715, "train_tokens_per_second": 254.389 }, { "epoch": 0.6222654350996597, "grad_norm": 0.05693833902478218, "learning_rate": 0.00015989495541065825, "loss": 0.0675, "num_input_tokens_seen": 12499104, "step": 1440, "train_runtime": 48986.6876, "train_tokens_per_second": 255.153 }, { "epoch": 0.6244260789715336, "grad_norm": 0.06325607001781464, "learning_rate": 0.0001596169197472667, "loss": 0.0756, "num_input_tokens_seen": 12542528, "step": 1445, "train_runtime": 49007.8533, "train_tokens_per_second": 255.929 }, { "epoch": 0.6265867228434073, "grad_norm": 0.06776595860719681, "learning_rate": 0.00015933816726002245, "loss": 0.0733, "num_input_tokens_seen": 12585680, "step": 1450, "train_runtime": 49030.8771, "train_tokens_per_second": 256.689 }, { "epoch": 0.6287473667152812, "grad_norm": 0.0681847482919693, "learning_rate": 0.0001590587013005987, "loss": 0.0825, "num_input_tokens_seen": 12629408, "step": 1455, "train_runtime": 49052.8605, "train_tokens_per_second": 257.465 }, { "epoch": 0.630908010587155, "grad_norm": 0.07597000896930695, "learning_rate": 0.00015877852522924732, "loss": 0.0824, "num_input_tokens_seen": 12673216, "step": 1460, "train_runtime": 49074.2647, "train_tokens_per_second": 258.246 }, { "epoch": 0.6330686544590288, "grad_norm": 0.0672716274857521, "learning_rate": 0.00015849764241475844, "loss": 0.0668, "num_input_tokens_seen": 12716384, "step": 1465, "train_runtime": 49096.5146, "train_tokens_per_second": 259.008 }, { "epoch": 0.6352292983309026, "grad_norm": 0.08129267394542694, "learning_rate": 0.00015821605623441993, "loss": 0.0781, "num_input_tokens_seen": 12760208, "step": 1470, "train_runtime": 49118.6675, "train_tokens_per_second": 259.783 }, { "epoch": 0.6373899422027764, "grad_norm": 0.1420648843050003, "learning_rate": 0.00015793377007397683, "loss": 0.0762, "num_input_tokens_seen": 12803552, "step": 1475, "train_runtime": 49139.971, "train_tokens_per_second": 260.553 }, { "epoch": 0.6395505860746502, "grad_norm": 0.08139633387327194, "learning_rate": 0.00015765078732759067, "loss": 0.0861, "num_input_tokens_seen": 12847792, "step": 1480, "train_runtime": 49162.4487, "train_tokens_per_second": 261.333 }, { "epoch": 0.6417112299465241, "grad_norm": 0.05912632867693901, "learning_rate": 0.00015736711139779856, "loss": 0.0706, "num_input_tokens_seen": 12891120, "step": 1485, "train_runtime": 49184.1068, "train_tokens_per_second": 262.099 }, { "epoch": 0.6438718738183978, "grad_norm": 0.0663304552435875, "learning_rate": 0.00015708274569547231, "loss": 0.077, "num_input_tokens_seen": 12934784, "step": 1490, "train_runtime": 49205.9826, "train_tokens_per_second": 262.87 }, { "epoch": 0.6460325176902717, "grad_norm": 0.04382750019431114, "learning_rate": 0.00015679769363977753, "loss": 0.0644, "num_input_tokens_seen": 12977696, "step": 1495, "train_runtime": 49226.9806, "train_tokens_per_second": 263.63 }, { "epoch": 0.6481931615621456, "grad_norm": 0.06222411245107651, "learning_rate": 0.00015651195865813234, "loss": 0.0721, "num_input_tokens_seen": 13020880, "step": 1500, "train_runtime": 49248.1972, "train_tokens_per_second": 264.393 }, { "epoch": 0.6503538054340193, "grad_norm": 0.05253620073199272, "learning_rate": 0.00015622554418616625, "loss": 0.0747, "num_input_tokens_seen": 13064176, "step": 1505, "train_runtime": 49269.4676, "train_tokens_per_second": 265.158 }, { "epoch": 0.6525144493058932, "grad_norm": 0.07142533361911774, "learning_rate": 0.0001559384536676789, "loss": 0.074, "num_input_tokens_seen": 13107616, "step": 1510, "train_runtime": 49291.1457, "train_tokens_per_second": 265.922 }, { "epoch": 0.654675093177767, "grad_norm": 0.06417880207300186, "learning_rate": 0.00015565069055459855, "loss": 0.0772, "num_input_tokens_seen": 13151296, "step": 1515, "train_runtime": 49313.0519, "train_tokens_per_second": 266.69 }, { "epoch": 0.6568357370496408, "grad_norm": 0.07250814884901047, "learning_rate": 0.00015536225830694068, "loss": 0.0677, "num_input_tokens_seen": 13194240, "step": 1520, "train_runtime": 49334.5357, "train_tokens_per_second": 267.444 }, { "epoch": 0.6589963809215146, "grad_norm": 0.07462477684020996, "learning_rate": 0.0001550731603927663, "loss": 0.0689, "num_input_tokens_seen": 13237296, "step": 1525, "train_runtime": 49356.2727, "train_tokens_per_second": 268.199 }, { "epoch": 0.6611570247933884, "grad_norm": 0.07405927777290344, "learning_rate": 0.00015478340028814028, "loss": 0.0765, "num_input_tokens_seen": 13280976, "step": 1530, "train_runtime": 49377.9371, "train_tokens_per_second": 268.966 }, { "epoch": 0.6633176686652622, "grad_norm": 0.0729064792394638, "learning_rate": 0.00015449298147708954, "loss": 0.0754, "num_input_tokens_seen": 13324480, "step": 1535, "train_runtime": 49400.043, "train_tokens_per_second": 269.726 }, { "epoch": 0.6654783125371361, "grad_norm": 0.07033967971801758, "learning_rate": 0.00015420190745156126, "loss": 0.0744, "num_input_tokens_seen": 13367904, "step": 1540, "train_runtime": 49421.5579, "train_tokens_per_second": 270.487 }, { "epoch": 0.6676389564090098, "grad_norm": 0.0658600926399231, "learning_rate": 0.0001539101817113807, "loss": 0.0763, "num_input_tokens_seen": 13411344, "step": 1545, "train_runtime": 49443.895, "train_tokens_per_second": 271.244 }, { "epoch": 0.6697996002808837, "grad_norm": 0.05582299083471298, "learning_rate": 0.00015361780776420924, "loss": 0.0774, "num_input_tokens_seen": 13455088, "step": 1550, "train_runtime": 49466.0961, "train_tokens_per_second": 272.006 }, { "epoch": 0.6719602441527576, "grad_norm": 0.07074993848800659, "learning_rate": 0.00015332478912550229, "loss": 0.0773, "num_input_tokens_seen": 13498336, "step": 1555, "train_runtime": 49487.8812, "train_tokens_per_second": 272.76 }, { "epoch": 0.6741208880246313, "grad_norm": 0.052721716463565826, "learning_rate": 0.0001530311293184668, "loss": 0.0701, "num_input_tokens_seen": 13541392, "step": 1560, "train_runtime": 49508.932, "train_tokens_per_second": 273.514 }, { "epoch": 0.6762815318965052, "grad_norm": 0.07522527128458023, "learning_rate": 0.00015273683187401913, "loss": 0.0825, "num_input_tokens_seen": 13585072, "step": 1565, "train_runtime": 49530.1248, "train_tokens_per_second": 274.279 }, { "epoch": 0.678442175768379, "grad_norm": 0.05335766449570656, "learning_rate": 0.00015244190033074243, "loss": 0.0747, "num_input_tokens_seen": 13628304, "step": 1570, "train_runtime": 49551.7374, "train_tokens_per_second": 275.032 }, { "epoch": 0.6806028196402528, "grad_norm": 0.055294234305620193, "learning_rate": 0.0001521463382348441, "loss": 0.071, "num_input_tokens_seen": 13671536, "step": 1575, "train_runtime": 49572.8424, "train_tokens_per_second": 275.787 }, { "epoch": 0.6827634635121266, "grad_norm": 0.0787506178021431, "learning_rate": 0.0001518501491401133, "loss": 0.0742, "num_input_tokens_seen": 13715280, "step": 1580, "train_runtime": 49595.5944, "train_tokens_per_second": 276.542 }, { "epoch": 0.6849241073840004, "grad_norm": 0.08133631199598312, "learning_rate": 0.00015155333660787806, "loss": 0.0815, "num_input_tokens_seen": 13759312, "step": 1585, "train_runtime": 49617.1912, "train_tokens_per_second": 277.309 }, { "epoch": 0.6870847512558742, "grad_norm": 0.06821410357952118, "learning_rate": 0.00015125590420696257, "loss": 0.0665, "num_input_tokens_seen": 13802112, "step": 1590, "train_runtime": 49638.3396, "train_tokens_per_second": 278.053 }, { "epoch": 0.6892453951277481, "grad_norm": 0.059675756841897964, "learning_rate": 0.00015095785551364412, "loss": 0.0696, "num_input_tokens_seen": 13845200, "step": 1595, "train_runtime": 49660.2073, "train_tokens_per_second": 278.799 }, { "epoch": 0.6914060389996219, "grad_norm": 0.05362169072031975, "learning_rate": 0.00015065919411161026, "loss": 0.0732, "num_input_tokens_seen": 13888800, "step": 1600, "train_runtime": 49681.8846, "train_tokens_per_second": 279.555 }, { "epoch": 0.6914060389996219, "eval_loss": 0.07299761474132538, "eval_runtime": 2467.271, "eval_samples_per_second": 3.752, "eval_steps_per_second": 1.876, "num_input_tokens_seen": 13888800, "step": 1600 }, { "epoch": 0.6935666828714957, "grad_norm": 0.05830957740545273, "learning_rate": 0.00015035992359191568, "loss": 0.0665, "num_input_tokens_seen": 13931968, "step": 1605, "train_runtime": 52175.1561, "train_tokens_per_second": 267.023 }, { "epoch": 0.6957273267433696, "grad_norm": 0.06323828548192978, "learning_rate": 0.00015006004755293886, "loss": 0.0732, "num_input_tokens_seen": 13975296, "step": 1610, "train_runtime": 52196.8339, "train_tokens_per_second": 267.742 }, { "epoch": 0.6978879706152433, "grad_norm": 0.0689457580447197, "learning_rate": 0.00014975956960033913, "loss": 0.0769, "num_input_tokens_seen": 14018608, "step": 1615, "train_runtime": 52218.6012, "train_tokens_per_second": 268.46 }, { "epoch": 0.7000486144871172, "grad_norm": 0.07999309152364731, "learning_rate": 0.00014945849334701308, "loss": 0.0759, "num_input_tokens_seen": 14062144, "step": 1620, "train_runtime": 52240.9771, "train_tokens_per_second": 269.178 }, { "epoch": 0.702209258358991, "grad_norm": 0.06296563893556595, "learning_rate": 0.000149156822413051, "loss": 0.0796, "num_input_tokens_seen": 14105600, "step": 1625, "train_runtime": 52262.9578, "train_tokens_per_second": 269.897 }, { "epoch": 0.7043699022308648, "grad_norm": 0.06787339597940445, "learning_rate": 0.00014885456042569372, "loss": 0.0772, "num_input_tokens_seen": 14149056, "step": 1630, "train_runtime": 52284.4218, "train_tokens_per_second": 270.617 }, { "epoch": 0.7065305461027386, "grad_norm": 0.06111348420381546, "learning_rate": 0.00014855171101928872, "loss": 0.077, "num_input_tokens_seen": 14192800, "step": 1635, "train_runtime": 52306.1416, "train_tokens_per_second": 271.341 }, { "epoch": 0.7086911899746124, "grad_norm": 0.06680696457624435, "learning_rate": 0.0001482482778352465, "loss": 0.0705, "num_input_tokens_seen": 14236272, "step": 1640, "train_runtime": 52327.9176, "train_tokens_per_second": 272.059 }, { "epoch": 0.7108518338464862, "grad_norm": 0.058288805186748505, "learning_rate": 0.00014794426452199687, "loss": 0.0693, "num_input_tokens_seen": 14279504, "step": 1645, "train_runtime": 52349.7701, "train_tokens_per_second": 272.771 }, { "epoch": 0.7130124777183601, "grad_norm": 0.06921833008527756, "learning_rate": 0.0001476396747349449, "loss": 0.0768, "num_input_tokens_seen": 14323296, "step": 1650, "train_runtime": 52371.3024, "train_tokens_per_second": 273.495 }, { "epoch": 0.7151731215902339, "grad_norm": 0.07161122560501099, "learning_rate": 0.00014733451213642712, "loss": 0.0785, "num_input_tokens_seen": 14367168, "step": 1655, "train_runtime": 52392.8415, "train_tokens_per_second": 274.22 }, { "epoch": 0.7173337654621077, "grad_norm": 0.08010240644216537, "learning_rate": 0.00014702878039566758, "loss": 0.0758, "num_input_tokens_seen": 14410560, "step": 1660, "train_runtime": 52414.3262, "train_tokens_per_second": 274.936 }, { "epoch": 0.7194944093339816, "grad_norm": 0.0588817335665226, "learning_rate": 0.00014672248318873342, "loss": 0.0695, "num_input_tokens_seen": 14453552, "step": 1665, "train_runtime": 52436.6055, "train_tokens_per_second": 275.639 }, { "epoch": 0.7216550532058553, "grad_norm": 0.06551285833120346, "learning_rate": 0.00014641562419849094, "loss": 0.0725, "num_input_tokens_seen": 14497120, "step": 1670, "train_runtime": 52458.3769, "train_tokens_per_second": 276.355 }, { "epoch": 0.7238156970777292, "grad_norm": 0.07580401748418808, "learning_rate": 0.00014610820711456122, "loss": 0.0848, "num_input_tokens_seen": 14540912, "step": 1675, "train_runtime": 52480.8053, "train_tokens_per_second": 277.071 }, { "epoch": 0.725976340949603, "grad_norm": 0.04763949662446976, "learning_rate": 0.0001458002356332758, "loss": 0.0666, "num_input_tokens_seen": 14583920, "step": 1680, "train_runtime": 52502.201, "train_tokens_per_second": 277.777 }, { "epoch": 0.7281369848214768, "grad_norm": 0.0561816431581974, "learning_rate": 0.0001454917134576321, "loss": 0.0683, "num_input_tokens_seen": 14627040, "step": 1685, "train_runtime": 52524.1203, "train_tokens_per_second": 278.482 }, { "epoch": 0.7302976286933506, "grad_norm": 0.0591006763279438, "learning_rate": 0.0001451826442972491, "loss": 0.0721, "num_input_tokens_seen": 14670560, "step": 1690, "train_runtime": 52546.3285, "train_tokens_per_second": 279.193 }, { "epoch": 0.7324582725652244, "grad_norm": 0.05632052198052406, "learning_rate": 0.00014487303186832255, "loss": 0.073, "num_input_tokens_seen": 14714208, "step": 1695, "train_runtime": 52567.7942, "train_tokens_per_second": 279.909 }, { "epoch": 0.7346189164370982, "grad_norm": 0.0633561760187149, "learning_rate": 0.00014456287989358048, "loss": 0.0773, "num_input_tokens_seen": 14757776, "step": 1700, "train_runtime": 52589.4678, "train_tokens_per_second": 280.622 }, { "epoch": 0.7367795603089721, "grad_norm": 0.058160725980997086, "learning_rate": 0.0001442521921022382, "loss": 0.0732, "num_input_tokens_seen": 14801440, "step": 1705, "train_runtime": 52611.4205, "train_tokens_per_second": 281.335 }, { "epoch": 0.7389402041808459, "grad_norm": 0.0793909877538681, "learning_rate": 0.0001439409722299537, "loss": 0.0794, "num_input_tokens_seen": 14845088, "step": 1710, "train_runtime": 52633.3935, "train_tokens_per_second": 282.047 }, { "epoch": 0.7411008480527197, "grad_norm": 0.06253749877214432, "learning_rate": 0.00014362922401878254, "loss": 0.072, "num_input_tokens_seen": 14888400, "step": 1715, "train_runtime": 52655.0177, "train_tokens_per_second": 282.754 }, { "epoch": 0.7432614919245936, "grad_norm": 0.061189230531454086, "learning_rate": 0.00014331695121713297, "loss": 0.0697, "num_input_tokens_seen": 14931328, "step": 1720, "train_runtime": 52676.0686, "train_tokens_per_second": 283.456 }, { "epoch": 0.7454221357964673, "grad_norm": 0.07376944273710251, "learning_rate": 0.0001430041575797208, "loss": 0.0718, "num_input_tokens_seen": 14974592, "step": 1725, "train_runtime": 52697.2214, "train_tokens_per_second": 284.163 }, { "epoch": 0.7475827796683412, "grad_norm": 0.07209795713424683, "learning_rate": 0.00014269084686752435, "loss": 0.0724, "num_input_tokens_seen": 15017824, "step": 1730, "train_runtime": 52719.2751, "train_tokens_per_second": 284.864 }, { "epoch": 0.749743423540215, "grad_norm": 0.05107741057872772, "learning_rate": 0.00014237702284773914, "loss": 0.0699, "num_input_tokens_seen": 15060864, "step": 1735, "train_runtime": 52741.7167, "train_tokens_per_second": 285.559 }, { "epoch": 0.7519040674120888, "grad_norm": 0.081186942756176, "learning_rate": 0.00014206268929373256, "loss": 0.0757, "num_input_tokens_seen": 15104000, "step": 1740, "train_runtime": 52763.6249, "train_tokens_per_second": 286.258 }, { "epoch": 0.7540647112839626, "grad_norm": 0.07930338382720947, "learning_rate": 0.0001417478499849986, "loss": 0.0782, "num_input_tokens_seen": 15147648, "step": 1745, "train_runtime": 52785.4191, "train_tokens_per_second": 286.967 }, { "epoch": 0.7562253551558364, "grad_norm": 0.07188103348016739, "learning_rate": 0.00014143250870711233, "loss": 0.0754, "num_input_tokens_seen": 15190896, "step": 1750, "train_runtime": 52807.0112, "train_tokens_per_second": 287.668 }, { "epoch": 0.7583859990277103, "grad_norm": 0.052755411714315414, "learning_rate": 0.00014111666925168442, "loss": 0.0686, "num_input_tokens_seen": 15233888, "step": 1755, "train_runtime": 52828.0814, "train_tokens_per_second": 288.367 }, { "epoch": 0.7605466428995841, "grad_norm": 0.05128923058509827, "learning_rate": 0.0001408003354163156, "loss": 0.0718, "num_input_tokens_seen": 15276944, "step": 1760, "train_runtime": 52850.1346, "train_tokens_per_second": 289.062 }, { "epoch": 0.7627072867714579, "grad_norm": 0.06151962652802467, "learning_rate": 0.0001404835110045509, "loss": 0.066, "num_input_tokens_seen": 15319904, "step": 1765, "train_runtime": 52872.1065, "train_tokens_per_second": 289.754 }, { "epoch": 0.7648679306433317, "grad_norm": 0.0742822214961052, "learning_rate": 0.0001401661998258339, "loss": 0.0756, "num_input_tokens_seen": 15363488, "step": 1770, "train_runtime": 52893.7032, "train_tokens_per_second": 290.46 }, { "epoch": 0.7670285745152056, "grad_norm": 0.0559610053896904, "learning_rate": 0.0001398484056954611, "loss": 0.0674, "num_input_tokens_seen": 15406704, "step": 1775, "train_runtime": 52916.0361, "train_tokens_per_second": 291.154 }, { "epoch": 0.7691892183870793, "grad_norm": 0.07098235189914703, "learning_rate": 0.00013953013243453582, "loss": 0.0744, "num_input_tokens_seen": 15450144, "step": 1780, "train_runtime": 52937.3355, "train_tokens_per_second": 291.857 }, { "epoch": 0.7713498622589532, "grad_norm": 0.06474477797746658, "learning_rate": 0.00013921138386992243, "loss": 0.0724, "num_input_tokens_seen": 15493568, "step": 1785, "train_runtime": 52959.2595, "train_tokens_per_second": 292.556 }, { "epoch": 0.773510506130827, "grad_norm": 0.06821322441101074, "learning_rate": 0.0001388921638342003, "loss": 0.0817, "num_input_tokens_seen": 15537664, "step": 1790, "train_runtime": 52980.746, "train_tokens_per_second": 293.27 }, { "epoch": 0.7756711500027008, "grad_norm": 0.07596802711486816, "learning_rate": 0.00013857247616561757, "loss": 0.0782, "num_input_tokens_seen": 15581200, "step": 1795, "train_runtime": 53003.4224, "train_tokens_per_second": 293.966 }, { "epoch": 0.7778317938745746, "grad_norm": 0.0654403567314148, "learning_rate": 0.00013825232470804523, "loss": 0.0732, "num_input_tokens_seen": 15624848, "step": 1800, "train_runtime": 53026.1216, "train_tokens_per_second": 294.663 }, { "epoch": 0.7778317938745746, "eval_loss": 0.07238650321960449, "eval_runtime": 712.4182, "eval_samples_per_second": 12.994, "eval_steps_per_second": 6.498, "num_input_tokens_seen": 15624848, "step": 1800 }, { "epoch": 0.7799924377464484, "grad_norm": 0.06928804516792297, "learning_rate": 0.00013793171331093077, "loss": 0.0793, "num_input_tokens_seen": 15668624, "step": 1805, "train_runtime": 53766.0817, "train_tokens_per_second": 291.422 }, { "epoch": 0.7821530816183223, "grad_norm": 0.06405510008335114, "learning_rate": 0.0001376106458292519, "loss": 0.0709, "num_input_tokens_seen": 15711872, "step": 1810, "train_runtime": 53788.3061, "train_tokens_per_second": 292.106 }, { "epoch": 0.7843137254901961, "grad_norm": 0.05507315695285797, "learning_rate": 0.00013728912612347017, "loss": 0.0745, "num_input_tokens_seen": 15755216, "step": 1815, "train_runtime": 53810.5879, "train_tokens_per_second": 292.79 }, { "epoch": 0.7864743693620699, "grad_norm": 0.05629369989037514, "learning_rate": 0.00013696715805948474, "loss": 0.0735, "num_input_tokens_seen": 15798480, "step": 1820, "train_runtime": 53831.7178, "train_tokens_per_second": 293.479 }, { "epoch": 0.7886350132339437, "grad_norm": 0.060665566474199295, "learning_rate": 0.00013664474550858553, "loss": 0.0651, "num_input_tokens_seen": 15841664, "step": 1825, "train_runtime": 53853.2255, "train_tokens_per_second": 294.164 }, { "epoch": 0.7907956571058176, "grad_norm": 0.05001268535852432, "learning_rate": 0.00013632189234740713, "loss": 0.0768, "num_input_tokens_seen": 15885376, "step": 1830, "train_runtime": 53874.7714, "train_tokens_per_second": 294.857 }, { "epoch": 0.7929563009776913, "grad_norm": 0.06804929673671722, "learning_rate": 0.00013599860245788178, "loss": 0.0761, "num_input_tokens_seen": 15929120, "step": 1835, "train_runtime": 53897.3682, "train_tokens_per_second": 295.545 }, { "epoch": 0.7951169448495652, "grad_norm": 0.04843413084745407, "learning_rate": 0.00013567487972719305, "loss": 0.0633, "num_input_tokens_seen": 15971568, "step": 1840, "train_runtime": 53919.5564, "train_tokens_per_second": 296.211 }, { "epoch": 0.797277588721439, "grad_norm": 0.06037944182753563, "learning_rate": 0.00013535072804772864, "loss": 0.0824, "num_input_tokens_seen": 16015520, "step": 1845, "train_runtime": 53941.2394, "train_tokens_per_second": 296.907 }, { "epoch": 0.7994382325933128, "grad_norm": 0.06481627374887466, "learning_rate": 0.00013502615131703413, "loss": 0.0695, "num_input_tokens_seen": 16058864, "step": 1850, "train_runtime": 53962.9055, "train_tokens_per_second": 297.591 }, { "epoch": 0.8015988764651866, "grad_norm": 0.06719739735126495, "learning_rate": 0.0001347011534377657, "loss": 0.0645, "num_input_tokens_seen": 16101680, "step": 1855, "train_runtime": 53984.6086, "train_tokens_per_second": 298.264 }, { "epoch": 0.8037595203370604, "grad_norm": 0.06208725646138191, "learning_rate": 0.00013437573831764343, "loss": 0.0812, "num_input_tokens_seen": 16145040, "step": 1860, "train_runtime": 54006.3057, "train_tokens_per_second": 298.947 }, { "epoch": 0.8059201642089343, "grad_norm": 0.06299016624689102, "learning_rate": 0.00013404990986940412, "loss": 0.0722, "num_input_tokens_seen": 16188416, "step": 1865, "train_runtime": 54028.3257, "train_tokens_per_second": 299.628 }, { "epoch": 0.8080808080808081, "grad_norm": 0.07144487649202347, "learning_rate": 0.00013372367201075453, "loss": 0.0614, "num_input_tokens_seen": 16230864, "step": 1870, "train_runtime": 54049.739, "train_tokens_per_second": 300.295 }, { "epoch": 0.8102414519526819, "grad_norm": 0.06753461062908173, "learning_rate": 0.00013339702866432392, "loss": 0.0695, "num_input_tokens_seen": 16274048, "step": 1875, "train_runtime": 54071.0694, "train_tokens_per_second": 300.975 }, { "epoch": 0.8124020958245557, "grad_norm": 0.0707787573337555, "learning_rate": 0.00013306998375761718, "loss": 0.0772, "num_input_tokens_seen": 16317760, "step": 1880, "train_runtime": 54092.53, "train_tokens_per_second": 301.664 }, { "epoch": 0.8145627396964296, "grad_norm": 0.07154600322246552, "learning_rate": 0.00013274254122296747, "loss": 0.0765, "num_input_tokens_seen": 16361440, "step": 1885, "train_runtime": 54113.9916, "train_tokens_per_second": 302.351 }, { "epoch": 0.8167233835683033, "grad_norm": 0.07142435014247894, "learning_rate": 0.00013241470499748893, "loss": 0.071, "num_input_tokens_seen": 16404896, "step": 1890, "train_runtime": 54135.2913, "train_tokens_per_second": 303.035 }, { "epoch": 0.8188840274401772, "grad_norm": 0.05690345913171768, "learning_rate": 0.00013208647902302945, "loss": 0.0692, "num_input_tokens_seen": 16448064, "step": 1895, "train_runtime": 54156.9443, "train_tokens_per_second": 303.711 }, { "epoch": 0.8210446713120509, "grad_norm": 0.06509065628051758, "learning_rate": 0.00013175786724612307, "loss": 0.0728, "num_input_tokens_seen": 16491408, "step": 1900, "train_runtime": 54178.256, "train_tokens_per_second": 304.392 }, { "epoch": 0.8232053151839248, "grad_norm": 0.0677073523402214, "learning_rate": 0.00013142887361794277, "loss": 0.0739, "num_input_tokens_seen": 16535056, "step": 1905, "train_runtime": 54199.8692, "train_tokens_per_second": 305.076 }, { "epoch": 0.8253659590557987, "grad_norm": 0.07482102513313293, "learning_rate": 0.00013109950209425284, "loss": 0.0731, "num_input_tokens_seen": 16578592, "step": 1910, "train_runtime": 54221.4112, "train_tokens_per_second": 305.757 }, { "epoch": 0.8275266029276724, "grad_norm": 0.058642659336328506, "learning_rate": 0.00013076975663536123, "loss": 0.072, "num_input_tokens_seen": 16622016, "step": 1915, "train_runtime": 54242.9455, "train_tokens_per_second": 306.436 }, { "epoch": 0.8296872467995463, "grad_norm": 0.10473670810461044, "learning_rate": 0.0001304396412060721, "loss": 0.0711, "num_input_tokens_seen": 16665008, "step": 1920, "train_runtime": 54264.7218, "train_tokens_per_second": 307.106 }, { "epoch": 0.8318478906714201, "grad_norm": 0.060704197734594345, "learning_rate": 0.00013010915977563803, "loss": 0.0677, "num_input_tokens_seen": 16707968, "step": 1925, "train_runtime": 54287.3702, "train_tokens_per_second": 307.769 }, { "epoch": 0.8340085345432939, "grad_norm": 0.06165318936109543, "learning_rate": 0.00012977831631771238, "loss": 0.0709, "num_input_tokens_seen": 16751296, "step": 1930, "train_runtime": 54308.64, "train_tokens_per_second": 308.446 }, { "epoch": 0.8361691784151677, "grad_norm": 0.05098670348525047, "learning_rate": 0.00012944711481030144, "loss": 0.0662, "num_input_tokens_seen": 16794128, "step": 1935, "train_runtime": 54330.2306, "train_tokens_per_second": 309.112 }, { "epoch": 0.8383298222870416, "grad_norm": 0.05525912716984749, "learning_rate": 0.00012911555923571667, "loss": 0.0733, "num_input_tokens_seen": 16837568, "step": 1940, "train_runtime": 54352.8277, "train_tokens_per_second": 309.783 }, { "epoch": 0.8404904661589153, "grad_norm": 0.06591261923313141, "learning_rate": 0.0001287836535805267, "loss": 0.0677, "num_input_tokens_seen": 16880768, "step": 1945, "train_runtime": 54374.0425, "train_tokens_per_second": 310.456 }, { "epoch": 0.8426511100307892, "grad_norm": 0.07755870372056961, "learning_rate": 0.00012845140183550952, "loss": 0.073, "num_input_tokens_seen": 16923904, "step": 1950, "train_runtime": 54395.7961, "train_tokens_per_second": 311.125 }, { "epoch": 0.8448117539026629, "grad_norm": 0.06816552579402924, "learning_rate": 0.00012811880799560443, "loss": 0.0734, "num_input_tokens_seen": 16967392, "step": 1955, "train_runtime": 54418.1536, "train_tokens_per_second": 311.797 }, { "epoch": 0.8469723977745368, "grad_norm": 0.07150571793317795, "learning_rate": 0.00012778587605986403, "loss": 0.0673, "num_input_tokens_seen": 17010560, "step": 1960, "train_runtime": 54441.1468, "train_tokens_per_second": 312.458 }, { "epoch": 0.8491330416464107, "grad_norm": 0.05127614736557007, "learning_rate": 0.0001274526100314061, "loss": 0.0626, "num_input_tokens_seen": 17053200, "step": 1965, "train_runtime": 54462.2921, "train_tokens_per_second": 313.119 }, { "epoch": 0.8512936855182844, "grad_norm": 0.06928465515375137, "learning_rate": 0.00012711901391736555, "loss": 0.0696, "num_input_tokens_seen": 17096224, "step": 1970, "train_runtime": 54483.9566, "train_tokens_per_second": 313.785 }, { "epoch": 0.8534543293901583, "grad_norm": 0.06846630573272705, "learning_rate": 0.00012678509172884617, "loss": 0.0774, "num_input_tokens_seen": 17139760, "step": 1975, "train_runtime": 54506.5848, "train_tokens_per_second": 314.453 }, { "epoch": 0.8556149732620321, "grad_norm": 0.062459319829940796, "learning_rate": 0.00012645084748087236, "loss": 0.0702, "num_input_tokens_seen": 17183136, "step": 1980, "train_runtime": 54528.9268, "train_tokens_per_second": 315.12 }, { "epoch": 0.8577756171339059, "grad_norm": 0.0698806494474411, "learning_rate": 0.00012611628519234094, "loss": 0.0707, "num_input_tokens_seen": 17226272, "step": 1985, "train_runtime": 54550.8877, "train_tokens_per_second": 315.784 }, { "epoch": 0.8599362610057797, "grad_norm": 0.06589354574680328, "learning_rate": 0.00012578140888597284, "loss": 0.0709, "num_input_tokens_seen": 17269712, "step": 1990, "train_runtime": 54572.7325, "train_tokens_per_second": 316.453 }, { "epoch": 0.8620969048776536, "grad_norm": 0.06981069594621658, "learning_rate": 0.00012544622258826464, "loss": 0.0736, "num_input_tokens_seen": 17312816, "step": 1995, "train_runtime": 54594.5932, "train_tokens_per_second": 317.116 }, { "epoch": 0.8642575487495273, "grad_norm": 0.05967501550912857, "learning_rate": 0.00012511073032944018, "loss": 0.0716, "num_input_tokens_seen": 17356192, "step": 2000, "train_runtime": 54617.9513, "train_tokens_per_second": 317.774 }, { "epoch": 0.8642575487495273, "eval_loss": 0.07185881584882736, "eval_runtime": 8674.4485, "eval_samples_per_second": 1.067, "eval_steps_per_second": 0.534, "num_input_tokens_seen": 17356192, "step": 2000 }, { "epoch": 0.8664181926214012, "grad_norm": 0.03572811186313629, "learning_rate": 0.0001247749361434022, "loss": 0.0743, "num_input_tokens_seen": 17399696, "step": 2005, "train_runtime": 63316.239, "train_tokens_per_second": 274.806 }, { "epoch": 0.8685788364932749, "grad_norm": 0.06791722029447556, "learning_rate": 0.00012443884406768368, "loss": 0.0736, "num_input_tokens_seen": 17443488, "step": 2010, "train_runtime": 63337.6458, "train_tokens_per_second": 275.405 }, { "epoch": 0.8707394803651488, "grad_norm": 0.06620905548334122, "learning_rate": 0.00012410245814339948, "loss": 0.0714, "num_input_tokens_seen": 17486960, "step": 2015, "train_runtime": 63359.6078, "train_tokens_per_second": 275.995 }, { "epoch": 0.8729001242370227, "grad_norm": 0.06703072041273117, "learning_rate": 0.0001237657824151975, "loss": 0.0704, "num_input_tokens_seen": 17530256, "step": 2020, "train_runtime": 63381.5042, "train_tokens_per_second": 276.583 }, { "epoch": 0.8750607681088964, "grad_norm": 0.0718189924955368, "learning_rate": 0.0001234288209312104, "loss": 0.0812, "num_input_tokens_seen": 17574496, "step": 2025, "train_runtime": 63403.8123, "train_tokens_per_second": 277.184 }, { "epoch": 0.8772214119807703, "grad_norm": 0.07962594926357269, "learning_rate": 0.0001230915777430065, "loss": 0.0722, "num_input_tokens_seen": 17618192, "step": 2030, "train_runtime": 63426.0337, "train_tokens_per_second": 277.775 }, { "epoch": 0.8793820558526441, "grad_norm": 0.07039056718349457, "learning_rate": 0.00012275405690554135, "loss": 0.0662, "num_input_tokens_seen": 17661008, "step": 2035, "train_runtime": 63448.4618, "train_tokens_per_second": 278.352 }, { "epoch": 0.8815426997245179, "grad_norm": 0.06130144000053406, "learning_rate": 0.00012241626247710906, "loss": 0.0712, "num_input_tokens_seen": 17704320, "step": 2040, "train_runtime": 63470.3468, "train_tokens_per_second": 278.938 }, { "epoch": 0.8837033435963917, "grad_norm": 0.06900149583816528, "learning_rate": 0.00012207819851929315, "loss": 0.0736, "num_input_tokens_seen": 17747808, "step": 2045, "train_runtime": 63492.3122, "train_tokens_per_second": 279.527 }, { "epoch": 0.8858639874682656, "grad_norm": 0.0547107569873333, "learning_rate": 0.00012173986909691799, "loss": 0.0737, "num_input_tokens_seen": 17791120, "step": 2050, "train_runtime": 63514.147, "train_tokens_per_second": 280.113 }, { "epoch": 0.8880246313401393, "grad_norm": 0.07016472518444061, "learning_rate": 0.0001214012782779999, "loss": 0.0714, "num_input_tokens_seen": 17834736, "step": 2055, "train_runtime": 63536.7563, "train_tokens_per_second": 280.7 }, { "epoch": 0.8901852752120132, "grad_norm": 0.060870055109262466, "learning_rate": 0.00012106243013369811, "loss": 0.0676, "num_input_tokens_seen": 17877760, "step": 2060, "train_runtime": 63559.4064, "train_tokens_per_second": 281.276 }, { "epoch": 0.892345919083887, "grad_norm": 0.06822679936885834, "learning_rate": 0.00012072332873826595, "loss": 0.0741, "num_input_tokens_seen": 17921072, "step": 2065, "train_runtime": 63580.9227, "train_tokens_per_second": 281.862 }, { "epoch": 0.8945065629557608, "grad_norm": 0.07840294390916824, "learning_rate": 0.00012038397816900177, "loss": 0.0758, "num_input_tokens_seen": 17964768, "step": 2070, "train_runtime": 63602.8765, "train_tokens_per_second": 282.452 }, { "epoch": 0.8966672068276347, "grad_norm": 0.05988030880689621, "learning_rate": 0.00012004438250619991, "loss": 0.0713, "num_input_tokens_seen": 18008112, "step": 2075, "train_runtime": 63624.4842, "train_tokens_per_second": 283.037 }, { "epoch": 0.8988278506995084, "grad_norm": 0.05252789333462715, "learning_rate": 0.0001197045458331018, "loss": 0.075, "num_input_tokens_seen": 18051376, "step": 2080, "train_runtime": 63645.7848, "train_tokens_per_second": 283.622 }, { "epoch": 0.9009884945713823, "grad_norm": 0.05312652140855789, "learning_rate": 0.00011936447223584657, "loss": 0.0708, "num_input_tokens_seen": 18094832, "step": 2085, "train_runtime": 63667.9044, "train_tokens_per_second": 284.206 }, { "epoch": 0.9031491384432561, "grad_norm": 0.06478448957204819, "learning_rate": 0.00011902416580342221, "loss": 0.0715, "num_input_tokens_seen": 18138112, "step": 2090, "train_runtime": 63689.3636, "train_tokens_per_second": 284.79 }, { "epoch": 0.9053097823151299, "grad_norm": 0.07553625106811523, "learning_rate": 0.00011868363062761621, "loss": 0.0727, "num_input_tokens_seen": 18181984, "step": 2095, "train_runtime": 63711.3409, "train_tokens_per_second": 285.381 }, { "epoch": 0.9074704261870037, "grad_norm": 0.05751855671405792, "learning_rate": 0.00011834287080296644, "loss": 0.0714, "num_input_tokens_seen": 18225232, "step": 2100, "train_runtime": 63733.4595, "train_tokens_per_second": 285.96 }, { "epoch": 0.9096310700588776, "grad_norm": 0.06295677274465561, "learning_rate": 0.00011800189042671198, "loss": 0.0745, "num_input_tokens_seen": 18268656, "step": 2105, "train_runtime": 63755.0208, "train_tokens_per_second": 286.545 }, { "epoch": 0.9117917139307513, "grad_norm": 0.05820206552743912, "learning_rate": 0.0001176606935987437, "loss": 0.0686, "num_input_tokens_seen": 18311760, "step": 2110, "train_runtime": 63776.8833, "train_tokens_per_second": 287.122 }, { "epoch": 0.9139523578026252, "grad_norm": 0.06826373189687729, "learning_rate": 0.00011731928442155508, "loss": 0.0661, "num_input_tokens_seen": 18354832, "step": 2115, "train_runtime": 63798.9017, "train_tokens_per_second": 287.698 }, { "epoch": 0.916113001674499, "grad_norm": 0.07378843426704407, "learning_rate": 0.00011697766700019289, "loss": 0.0793, "num_input_tokens_seen": 18398608, "step": 2120, "train_runtime": 63820.827, "train_tokens_per_second": 288.285 }, { "epoch": 0.9182736455463728, "grad_norm": 0.05657931789755821, "learning_rate": 0.0001166358454422077, "loss": 0.0736, "num_input_tokens_seen": 18442160, "step": 2125, "train_runtime": 63843.125, "train_tokens_per_second": 288.867 }, { "epoch": 0.9204342894182467, "grad_norm": 0.07037783414125443, "learning_rate": 0.0001162938238576047, "loss": 0.0711, "num_input_tokens_seen": 18485376, "step": 2130, "train_runtime": 63864.611, "train_tokens_per_second": 289.446 }, { "epoch": 0.9225949332901204, "grad_norm": 0.07411188632249832, "learning_rate": 0.00011595160635879407, "loss": 0.0704, "num_input_tokens_seen": 18528976, "step": 2135, "train_runtime": 63887.3619, "train_tokens_per_second": 290.026 }, { "epoch": 0.9247555771619943, "grad_norm": 0.06208517774939537, "learning_rate": 0.00011560919706054167, "loss": 0.067, "num_input_tokens_seen": 18571888, "step": 2140, "train_runtime": 63909.8073, "train_tokens_per_second": 290.595 }, { "epoch": 0.9269162210338681, "grad_norm": 0.07666601240634918, "learning_rate": 0.00011526660007991956, "loss": 0.071, "num_input_tokens_seen": 18615296, "step": 2145, "train_runtime": 63932.0127, "train_tokens_per_second": 291.173 }, { "epoch": 0.9290768649057419, "grad_norm": 0.06127588078379631, "learning_rate": 0.0001149238195362564, "loss": 0.0689, "num_input_tokens_seen": 18658384, "step": 2150, "train_runtime": 63953.3132, "train_tokens_per_second": 291.75 }, { "epoch": 0.9312375087776157, "grad_norm": 0.074986532330513, "learning_rate": 0.000114580859551088, "loss": 0.08, "num_input_tokens_seen": 18702240, "step": 2155, "train_runtime": 63975.8598, "train_tokens_per_second": 292.333 }, { "epoch": 0.9333981526494896, "grad_norm": 0.06027218699455261, "learning_rate": 0.00011423772424810775, "loss": 0.0711, "num_input_tokens_seen": 18745424, "step": 2160, "train_runtime": 63997.5378, "train_tokens_per_second": 292.909 }, { "epoch": 0.9355587965213633, "grad_norm": 0.0647510513663292, "learning_rate": 0.00011389441775311704, "loss": 0.0757, "num_input_tokens_seen": 18788960, "step": 2165, "train_runtime": 64019.1462, "train_tokens_per_second": 293.49 }, { "epoch": 0.9377194403932372, "grad_norm": 0.053142938762903214, "learning_rate": 0.00011355094419397563, "loss": 0.0673, "num_input_tokens_seen": 18832304, "step": 2170, "train_runtime": 64040.9072, "train_tokens_per_second": 294.067 }, { "epoch": 0.939880084265111, "grad_norm": 0.07196088880300522, "learning_rate": 0.00011320730770055204, "loss": 0.0764, "num_input_tokens_seen": 18876144, "step": 2175, "train_runtime": 64063.2079, "train_tokens_per_second": 294.649 }, { "epoch": 0.9420407281369848, "grad_norm": 0.06955163925886154, "learning_rate": 0.00011286351240467387, "loss": 0.0775, "num_input_tokens_seen": 18920064, "step": 2180, "train_runtime": 64084.8585, "train_tokens_per_second": 295.235 }, { "epoch": 0.9442013720088587, "grad_norm": 0.08103214204311371, "learning_rate": 0.00011251956244007819, "loss": 0.0766, "num_input_tokens_seen": 18963888, "step": 2185, "train_runtime": 64106.7078, "train_tokens_per_second": 295.818 }, { "epoch": 0.9463620158807324, "grad_norm": 0.06878823786973953, "learning_rate": 0.0001121754619423617, "loss": 0.0731, "num_input_tokens_seen": 19007312, "step": 2190, "train_runtime": 64128.2154, "train_tokens_per_second": 296.395 }, { "epoch": 0.9485226597526063, "grad_norm": 0.05954969301819801, "learning_rate": 0.00011183121504893108, "loss": 0.0783, "num_input_tokens_seen": 19050768, "step": 2195, "train_runtime": 64150.9571, "train_tokens_per_second": 296.968 }, { "epoch": 0.9506833036244801, "grad_norm": 0.04972570016980171, "learning_rate": 0.00011148682589895339, "loss": 0.0689, "num_input_tokens_seen": 19094480, "step": 2200, "train_runtime": 64173.1029, "train_tokens_per_second": 297.546 }, { "epoch": 0.9506833036244801, "eval_loss": 0.07115475833415985, "eval_runtime": 716.0686, "eval_samples_per_second": 12.928, "eval_steps_per_second": 6.464, "num_input_tokens_seen": 19094480, "step": 2200 }, { "epoch": 0.9528439474963539, "grad_norm": 0.051516707986593246, "learning_rate": 0.00011114229863330596, "loss": 0.0686, "num_input_tokens_seen": 19137488, "step": 2205, "train_runtime": 64915.0379, "train_tokens_per_second": 294.808 }, { "epoch": 0.9550045913682277, "grad_norm": 0.06157120689749718, "learning_rate": 0.00011079763739452696, "loss": 0.0687, "num_input_tokens_seen": 19180768, "step": 2210, "train_runtime": 64936.8917, "train_tokens_per_second": 295.376 }, { "epoch": 0.9571652352401016, "grad_norm": 0.07093177735805511, "learning_rate": 0.00011045284632676536, "loss": 0.0791, "num_input_tokens_seen": 19224560, "step": 2215, "train_runtime": 64958.8453, "train_tokens_per_second": 295.95 }, { "epoch": 0.9593258791119754, "grad_norm": 0.08016602694988251, "learning_rate": 0.00011010792957573115, "loss": 0.0719, "num_input_tokens_seen": 19267696, "step": 2220, "train_runtime": 64981.2868, "train_tokens_per_second": 296.511 }, { "epoch": 0.9614865229838492, "grad_norm": 0.06384464353322983, "learning_rate": 0.00010976289128864556, "loss": 0.0698, "num_input_tokens_seen": 19311152, "step": 2225, "train_runtime": 65002.6752, "train_tokens_per_second": 297.082 }, { "epoch": 0.963647166855723, "grad_norm": 0.06991935521364212, "learning_rate": 0.00010941773561419117, "loss": 0.0764, "num_input_tokens_seen": 19354544, "step": 2230, "train_runtime": 65024.4873, "train_tokens_per_second": 297.65 }, { "epoch": 0.9658078107275968, "grad_norm": 0.06561180204153061, "learning_rate": 0.00010907246670246194, "loss": 0.0678, "num_input_tokens_seen": 19397824, "step": 2235, "train_runtime": 65047.126, "train_tokens_per_second": 298.212 }, { "epoch": 0.9679684545994707, "grad_norm": 0.0609147846698761, "learning_rate": 0.00010872708870491337, "loss": 0.07, "num_input_tokens_seen": 19441136, "step": 2240, "train_runtime": 65069.6388, "train_tokens_per_second": 298.774 }, { "epoch": 0.9701290984713444, "grad_norm": 0.049989230930805206, "learning_rate": 0.00010838160577431269, "loss": 0.0709, "num_input_tokens_seen": 19484528, "step": 2245, "train_runtime": 65092.5491, "train_tokens_per_second": 299.336 }, { "epoch": 0.9722897423432183, "grad_norm": 0.05758450925350189, "learning_rate": 0.0001080360220646887, "loss": 0.0738, "num_input_tokens_seen": 19527920, "step": 2250, "train_runtime": 65114.2821, "train_tokens_per_second": 299.902 }, { "epoch": 0.9744503862150921, "grad_norm": 0.06077814847230911, "learning_rate": 0.00010769034173128207, "loss": 0.0837, "num_input_tokens_seen": 19571984, "step": 2255, "train_runtime": 65136.2202, "train_tokens_per_second": 300.478 }, { "epoch": 0.9766110300869659, "grad_norm": 0.05624840408563614, "learning_rate": 0.00010734456893049514, "loss": 0.0733, "num_input_tokens_seen": 19615088, "step": 2260, "train_runtime": 65158.2319, "train_tokens_per_second": 301.038 }, { "epoch": 0.9787716739588397, "grad_norm": 0.08435752242803574, "learning_rate": 0.00010699870781984218, "loss": 0.0674, "num_input_tokens_seen": 19658224, "step": 2265, "train_runtime": 65180.7736, "train_tokens_per_second": 301.595 }, { "epoch": 0.9809323178307136, "grad_norm": 0.06772467494010925, "learning_rate": 0.00010665276255789923, "loss": 0.0609, "num_input_tokens_seen": 19700912, "step": 2270, "train_runtime": 65201.7743, "train_tokens_per_second": 302.153 }, { "epoch": 0.9830929617025874, "grad_norm": 0.06909680366516113, "learning_rate": 0.00010630673730425412, "loss": 0.0692, "num_input_tokens_seen": 19744352, "step": 2275, "train_runtime": 65223.8375, "train_tokens_per_second": 302.717 }, { "epoch": 0.9852536055744612, "grad_norm": 0.06532754749059677, "learning_rate": 0.0001059606362194565, "loss": 0.0675, "num_input_tokens_seen": 19787488, "step": 2280, "train_runtime": 65245.6541, "train_tokens_per_second": 303.277 }, { "epoch": 0.987414249446335, "grad_norm": 0.06435127556324005, "learning_rate": 0.00010561446346496786, "loss": 0.0713, "num_input_tokens_seen": 19830608, "step": 2285, "train_runtime": 65267.3727, "train_tokens_per_second": 303.836 }, { "epoch": 0.9895748933182088, "grad_norm": 0.0690252035856247, "learning_rate": 0.00010526822320311136, "loss": 0.0785, "num_input_tokens_seen": 19873984, "step": 2290, "train_runtime": 65289.8265, "train_tokens_per_second": 304.396 }, { "epoch": 0.9917355371900827, "grad_norm": 0.06570211052894592, "learning_rate": 0.00010492191959702187, "loss": 0.0677, "num_input_tokens_seen": 19917008, "step": 2295, "train_runtime": 65312.2643, "train_tokens_per_second": 304.951 }, { "epoch": 0.9938961810619564, "grad_norm": 0.06227204203605652, "learning_rate": 0.00010457555681059597, "loss": 0.0683, "num_input_tokens_seen": 19960096, "step": 2300, "train_runtime": 65334.0717, "train_tokens_per_second": 305.508 }, { "epoch": 0.9960568249338303, "grad_norm": 0.08121524751186371, "learning_rate": 0.00010422913900844169, "loss": 0.0766, "num_input_tokens_seen": 20004080, "step": 2305, "train_runtime": 65356.4613, "train_tokens_per_second": 306.077 }, { "epoch": 0.9982174688057041, "grad_norm": 0.07001639157533646, "learning_rate": 0.0001038826703558287, "loss": 0.0694, "num_input_tokens_seen": 20047552, "step": 2310, "train_runtime": 65378.934, "train_tokens_per_second": 306.636 }, { "epoch": 1.0, "grad_norm": 0.2362603098154068, "learning_rate": 0.00010353615501863799, "loss": 0.0615, "num_input_tokens_seen": 20082296, "step": 2315, "train_runtime": 65397.7722, "train_tokens_per_second": 307.079 }, { "epoch": 1.0021606438718738, "grad_norm": 0.07865723967552185, "learning_rate": 0.00010318959716331191, "loss": 0.0746, "num_input_tokens_seen": 20126264, "step": 2320, "train_runtime": 65419.76, "train_tokens_per_second": 307.648 }, { "epoch": 1.0043212877437477, "grad_norm": 0.061586473137140274, "learning_rate": 0.00010284300095680403, "loss": 0.0648, "num_input_tokens_seen": 20169208, "step": 2325, "train_runtime": 65440.9527, "train_tokens_per_second": 308.205 }, { "epoch": 1.0064819316156215, "grad_norm": 0.06647315621376038, "learning_rate": 0.00010249637056652906, "loss": 0.064, "num_input_tokens_seen": 20212504, "step": 2330, "train_runtime": 65462.4612, "train_tokens_per_second": 308.765 }, { "epoch": 1.0086425754874953, "grad_norm": 0.0650409534573555, "learning_rate": 0.00010214971016031274, "loss": 0.0675, "num_input_tokens_seen": 20255400, "step": 2335, "train_runtime": 65484.7168, "train_tokens_per_second": 309.315 }, { "epoch": 1.010803219359369, "grad_norm": 0.06883776932954788, "learning_rate": 0.00010180302390634168, "loss": 0.0637, "num_input_tokens_seen": 20298760, "step": 2340, "train_runtime": 65506.2982, "train_tokens_per_second": 309.875 }, { "epoch": 1.012963863231243, "grad_norm": 0.06613084673881531, "learning_rate": 0.00010145631597311334, "loss": 0.0703, "num_input_tokens_seen": 20342456, "step": 2345, "train_runtime": 65528.323, "train_tokens_per_second": 310.438 }, { "epoch": 1.0151245071031167, "grad_norm": 0.08105847239494324, "learning_rate": 0.00010110959052938575, "loss": 0.07, "num_input_tokens_seen": 20386088, "step": 2350, "train_runtime": 65550.3814, "train_tokens_per_second": 310.999 }, { "epoch": 1.0172851509749905, "grad_norm": 0.06776931136846542, "learning_rate": 0.00010076285174412759, "loss": 0.0673, "num_input_tokens_seen": 20429368, "step": 2355, "train_runtime": 65571.624, "train_tokens_per_second": 311.558 }, { "epoch": 1.0194457948468643, "grad_norm": 0.08375387638807297, "learning_rate": 0.00010041610378646789, "loss": 0.0653, "num_input_tokens_seen": 20472888, "step": 2360, "train_runtime": 65593.4934, "train_tokens_per_second": 312.118 }, { "epoch": 1.0216064387187382, "grad_norm": 0.061812516301870346, "learning_rate": 0.00010006935082564599, "loss": 0.0702, "num_input_tokens_seen": 20516760, "step": 2365, "train_runtime": 65615.2348, "train_tokens_per_second": 312.683 }, { "epoch": 1.023767082590612, "grad_norm": 0.061185307800769806, "learning_rate": 9.97225970309614e-05, "loss": 0.0676, "num_input_tokens_seen": 20560472, "step": 2370, "train_runtime": 65637.3028, "train_tokens_per_second": 313.244 }, { "epoch": 1.0259277264624858, "grad_norm": 0.06244645267724991, "learning_rate": 9.937584657172361e-05, "loss": 0.0696, "num_input_tokens_seen": 20604024, "step": 2375, "train_runtime": 65659.1613, "train_tokens_per_second": 313.803 }, { "epoch": 1.0280883703343597, "grad_norm": 0.07663462311029434, "learning_rate": 9.902910361720203e-05, "loss": 0.0688, "num_input_tokens_seen": 20647528, "step": 2380, "train_runtime": 65681.0633, "train_tokens_per_second": 314.36 }, { "epoch": 1.0302490142062335, "grad_norm": 0.07399339973926544, "learning_rate": 9.868237233657588e-05, "loss": 0.0638, "num_input_tokens_seen": 20690680, "step": 2385, "train_runtime": 65702.4149, "train_tokens_per_second": 314.915 }, { "epoch": 1.0324096580781073, "grad_norm": 0.050758518278598785, "learning_rate": 9.833565689888395e-05, "loss": 0.0567, "num_input_tokens_seen": 20733304, "step": 2390, "train_runtime": 65723.6939, "train_tokens_per_second": 315.462 }, { "epoch": 1.034570301949981, "grad_norm": 0.06294345110654831, "learning_rate": 9.798896147297457e-05, "loss": 0.071, "num_input_tokens_seen": 20776664, "step": 2395, "train_runtime": 65745.5546, "train_tokens_per_second": 316.016 }, { "epoch": 1.036730945821855, "grad_norm": 0.08008322864770889, "learning_rate": 9.764229022745543e-05, "loss": 0.0686, "num_input_tokens_seen": 20820072, "step": 2400, "train_runtime": 65767.2053, "train_tokens_per_second": 316.572 }, { "epoch": 1.036730945821855, "eval_loss": 0.07095986604690552, "eval_runtime": 31720.4675, "eval_samples_per_second": 0.292, "eval_steps_per_second": 0.146, "num_input_tokens_seen": 20820072, "step": 2400 }, { "epoch": 1.0388915896937287, "grad_norm": 0.06503720581531525, "learning_rate": 9.72956473306435e-05, "loss": 0.0666, "num_input_tokens_seen": 20863896, "step": 2405, "train_runtime": 97511.5446, "train_tokens_per_second": 213.963 }, { "epoch": 1.0410522335656025, "grad_norm": 0.070051409304142, "learning_rate": 9.694903695051488e-05, "loss": 0.0648, "num_input_tokens_seen": 20907080, "step": 2410, "train_runtime": 97533.7562, "train_tokens_per_second": 214.357 }, { "epoch": 1.0432128774374765, "grad_norm": 0.05461447313427925, "learning_rate": 9.660246325465471e-05, "loss": 0.0664, "num_input_tokens_seen": 20950152, "step": 2415, "train_runtime": 97554.9301, "train_tokens_per_second": 214.752 }, { "epoch": 1.0453735213093502, "grad_norm": 0.07258272171020508, "learning_rate": 9.625593041020701e-05, "loss": 0.0643, "num_input_tokens_seen": 20993624, "step": 2420, "train_runtime": 97576.2055, "train_tokens_per_second": 215.151 }, { "epoch": 1.047534165181224, "grad_norm": 0.07304082065820694, "learning_rate": 9.590944258382466e-05, "loss": 0.065, "num_input_tokens_seen": 21036984, "step": 2425, "train_runtime": 97597.7384, "train_tokens_per_second": 215.548 }, { "epoch": 1.0496948090530978, "grad_norm": 0.08018536120653152, "learning_rate": 9.556300394161919e-05, "loss": 0.0753, "num_input_tokens_seen": 21080616, "step": 2430, "train_runtime": 97619.4508, "train_tokens_per_second": 215.947 }, { "epoch": 1.0518554529249717, "grad_norm": 0.06374417245388031, "learning_rate": 9.52166186491108e-05, "loss": 0.067, "num_input_tokens_seen": 21124120, "step": 2435, "train_runtime": 97641.5898, "train_tokens_per_second": 216.343 }, { "epoch": 1.0540160967968455, "grad_norm": 0.056240521371364594, "learning_rate": 9.48702908711782e-05, "loss": 0.0604, "num_input_tokens_seen": 21167000, "step": 2440, "train_runtime": 97663.0543, "train_tokens_per_second": 216.735 }, { "epoch": 1.0561767406687192, "grad_norm": 0.08230195939540863, "learning_rate": 9.452402477200852e-05, "loss": 0.0645, "num_input_tokens_seen": 21210232, "step": 2445, "train_runtime": 97685.1274, "train_tokens_per_second": 217.129 }, { "epoch": 1.058337384540593, "grad_norm": 0.060752853751182556, "learning_rate": 9.417782451504737e-05, "loss": 0.0681, "num_input_tokens_seen": 21253656, "step": 2450, "train_runtime": 97706.5559, "train_tokens_per_second": 217.525 }, { "epoch": 1.060498028412467, "grad_norm": 0.07154619693756104, "learning_rate": 9.383169426294861e-05, "loss": 0.0686, "num_input_tokens_seen": 21296840, "step": 2455, "train_runtime": 97728.2395, "train_tokens_per_second": 217.919 }, { "epoch": 1.0626586722843407, "grad_norm": 0.06834197789430618, "learning_rate": 9.348563817752437e-05, "loss": 0.0645, "num_input_tokens_seen": 21340056, "step": 2460, "train_runtime": 97750.0436, "train_tokens_per_second": 218.312 }, { "epoch": 1.0648193161562145, "grad_norm": 0.07614444941282272, "learning_rate": 9.313966041969501e-05, "loss": 0.0695, "num_input_tokens_seen": 21383464, "step": 2465, "train_runtime": 97772.6309, "train_tokens_per_second": 218.706 }, { "epoch": 1.0669799600280885, "grad_norm": 0.06275052577257156, "learning_rate": 9.279376514943915e-05, "loss": 0.0711, "num_input_tokens_seen": 21426760, "step": 2470, "train_runtime": 97794.7669, "train_tokens_per_second": 219.099 }, { "epoch": 1.0691406038999622, "grad_norm": 0.061219897121191025, "learning_rate": 9.244795652574354e-05, "loss": 0.0659, "num_input_tokens_seen": 21470104, "step": 2475, "train_runtime": 97816.8904, "train_tokens_per_second": 219.493 }, { "epoch": 1.071301247771836, "grad_norm": 0.06351316720247269, "learning_rate": 9.210223870655312e-05, "loss": 0.0635, "num_input_tokens_seen": 21513112, "step": 2480, "train_runtime": 97837.7716, "train_tokens_per_second": 219.886 }, { "epoch": 1.0734618916437098, "grad_norm": 0.06057807803153992, "learning_rate": 9.175661584872103e-05, "loss": 0.0681, "num_input_tokens_seen": 21556600, "step": 2485, "train_runtime": 97859.0562, "train_tokens_per_second": 220.282 }, { "epoch": 1.0756225355155837, "grad_norm": 0.06607088446617126, "learning_rate": 9.141109210795859e-05, "loss": 0.0631, "num_input_tokens_seen": 21599688, "step": 2490, "train_runtime": 97880.2066, "train_tokens_per_second": 220.675 }, { "epoch": 1.0777831793874575, "grad_norm": 0.08129261434078217, "learning_rate": 9.106567163878533e-05, "loss": 0.0681, "num_input_tokens_seen": 21642872, "step": 2495, "train_runtime": 97901.951, "train_tokens_per_second": 221.067 }, { "epoch": 1.0799438232593312, "grad_norm": 0.06941742449998856, "learning_rate": 9.072035859447913e-05, "loss": 0.0688, "num_input_tokens_seen": 21686264, "step": 2500, "train_runtime": 97923.7723, "train_tokens_per_second": 221.461 }, { "epoch": 1.082104467131205, "grad_norm": 0.06604190915822983, "learning_rate": 9.037515712702613e-05, "loss": 0.0741, "num_input_tokens_seen": 21729544, "step": 2505, "train_runtime": 97945.6763, "train_tokens_per_second": 221.853 }, { "epoch": 1.084265111003079, "grad_norm": 0.0760832205414772, "learning_rate": 9.003007138707095e-05, "loss": 0.0697, "num_input_tokens_seen": 21773240, "step": 2510, "train_runtime": 97968.2869, "train_tokens_per_second": 222.248 }, { "epoch": 1.0864257548749527, "grad_norm": 0.06299443542957306, "learning_rate": 8.968510552386668e-05, "loss": 0.0663, "num_input_tokens_seen": 21816552, "step": 2515, "train_runtime": 97989.9696, "train_tokens_per_second": 222.641 }, { "epoch": 1.0885863987468265, "grad_norm": 0.062478598207235336, "learning_rate": 8.93402636852251e-05, "loss": 0.0662, "num_input_tokens_seen": 21860056, "step": 2520, "train_runtime": 98011.1485, "train_tokens_per_second": 223.036 }, { "epoch": 1.0907470426187005, "grad_norm": 0.08520319312810898, "learning_rate": 8.899555001746664e-05, "loss": 0.0755, "num_input_tokens_seen": 21903896, "step": 2525, "train_runtime": 98033.6589, "train_tokens_per_second": 223.432 }, { "epoch": 1.0929076864905742, "grad_norm": 0.060366444289684296, "learning_rate": 8.865096866537071e-05, "loss": 0.0676, "num_input_tokens_seen": 21947240, "step": 2530, "train_runtime": 98055.8604, "train_tokens_per_second": 223.824 }, { "epoch": 1.095068330362448, "grad_norm": 0.06165176257491112, "learning_rate": 8.83065237721257e-05, "loss": 0.0676, "num_input_tokens_seen": 21990456, "step": 2535, "train_runtime": 98077.877, "train_tokens_per_second": 224.214 }, { "epoch": 1.0972289742343218, "grad_norm": 0.06571424752473831, "learning_rate": 8.796221947927932e-05, "loss": 0.0681, "num_input_tokens_seen": 22033624, "step": 2540, "train_runtime": 98099.4736, "train_tokens_per_second": 224.605 }, { "epoch": 1.0993896181061957, "grad_norm": 0.06270553916692734, "learning_rate": 8.761805992668869e-05, "loss": 0.0628, "num_input_tokens_seen": 22076664, "step": 2545, "train_runtime": 98121.768, "train_tokens_per_second": 224.993 }, { "epoch": 1.1015502619780695, "grad_norm": 0.0668216124176979, "learning_rate": 8.727404925247058e-05, "loss": 0.0643, "num_input_tokens_seen": 22119784, "step": 2550, "train_runtime": 98142.8207, "train_tokens_per_second": 225.384 }, { "epoch": 1.1037109058499432, "grad_norm": 0.07844366133213043, "learning_rate": 8.693019159295176e-05, "loss": 0.0682, "num_input_tokens_seen": 22163224, "step": 2555, "train_runtime": 98164.731, "train_tokens_per_second": 225.776 }, { "epoch": 1.105871549721817, "grad_norm": 0.05678049847483635, "learning_rate": 8.658649108261899e-05, "loss": 0.066, "num_input_tokens_seen": 22206840, "step": 2560, "train_runtime": 98186.7714, "train_tokens_per_second": 226.169 }, { "epoch": 1.108032193593691, "grad_norm": 0.07098106294870377, "learning_rate": 8.624295185406964e-05, "loss": 0.0658, "num_input_tokens_seen": 22250264, "step": 2565, "train_runtime": 98208.9343, "train_tokens_per_second": 226.56 }, { "epoch": 1.1101928374655647, "grad_norm": 0.07217643409967422, "learning_rate": 8.589957803796187e-05, "loss": 0.0639, "num_input_tokens_seen": 22293448, "step": 2570, "train_runtime": 98230.9853, "train_tokens_per_second": 226.949 }, { "epoch": 1.1123534813374385, "grad_norm": 0.06756918132305145, "learning_rate": 8.555637376296489e-05, "loss": 0.0631, "num_input_tokens_seen": 22336632, "step": 2575, "train_runtime": 98252.5038, "train_tokens_per_second": 227.339 }, { "epoch": 1.1145141252093125, "grad_norm": 0.05753394216299057, "learning_rate": 8.521334315570939e-05, "loss": 0.0635, "num_input_tokens_seen": 22379848, "step": 2580, "train_runtime": 98274.0744, "train_tokens_per_second": 227.729 }, { "epoch": 1.1166747690811862, "grad_norm": 0.06545528769493103, "learning_rate": 8.48704903407379e-05, "loss": 0.0658, "num_input_tokens_seen": 22423064, "step": 2585, "train_runtime": 98296.1108, "train_tokens_per_second": 228.118 }, { "epoch": 1.11883541295306, "grad_norm": 0.07646580785512924, "learning_rate": 8.45278194404552e-05, "loss": 0.0751, "num_input_tokens_seen": 22466600, "step": 2590, "train_runtime": 98317.5035, "train_tokens_per_second": 228.511 }, { "epoch": 1.1209960568249338, "grad_norm": 0.06202688813209534, "learning_rate": 8.41853345750788e-05, "loss": 0.0626, "num_input_tokens_seen": 22509448, "step": 2595, "train_runtime": 98338.7439, "train_tokens_per_second": 228.897 }, { "epoch": 1.1231567006968077, "grad_norm": 0.05783059075474739, "learning_rate": 8.384303986258932e-05, "loss": 0.0635, "num_input_tokens_seen": 22552760, "step": 2600, "train_runtime": 98360.0854, "train_tokens_per_second": 229.288 }, { "epoch": 1.1231567006968077, "eval_loss": 0.07066521048545837, "eval_runtime": 710.5608, "eval_samples_per_second": 13.028, "eval_steps_per_second": 6.515, "num_input_tokens_seen": 22552760, "step": 2600 } ], "logging_steps": 5, "max_steps": 4630, "num_input_tokens_seen": 22552760, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0270836768097075e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }