diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8332 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5000, + "global_step": 965749, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001035465736956497, + "grad_norm": 5.295963764190674, + "learning_rate": 9.92e-06, + "loss": 8.4977, + "step": 1000 + }, + { + "epoch": 0.002070931473912994, + "grad_norm": 2.1808838844299316, + "learning_rate": 1.992e-05, + "loss": 5.7562, + "step": 2000 + }, + { + "epoch": 0.003106397210869491, + "grad_norm": 2.5940399169921875, + "learning_rate": 2.9920000000000005e-05, + "loss": 5.1705, + "step": 3000 + }, + { + "epoch": 0.004141862947825988, + "grad_norm": 2.5618865489959717, + "learning_rate": 3.9920000000000004e-05, + "loss": 4.7383, + "step": 4000 + }, + { + "epoch": 0.005177328684782485, + "grad_norm": 2.15096116065979, + "learning_rate": 4.992e-05, + "loss": 4.4163, + "step": 5000 + }, + { + "epoch": 0.005177328684782485, + "eval_loss": 4.266116142272949, + "eval_runtime": 16.5978, + "eval_samples_per_second": 2979.738, + "eval_steps_per_second": 11.688, + "step": 5000 + }, + { + "epoch": 0.006212794421738982, + "grad_norm": 2.0443923473358154, + "learning_rate": 5e-05, + "loss": 4.1981, + "step": 6000 + }, + { + "epoch": 0.007248260158695479, + "grad_norm": 2.379178524017334, + "learning_rate": 5e-05, + "loss": 3.9849, + "step": 7000 + }, + { + "epoch": 0.008283725895651977, + "grad_norm": 2.9000627994537354, + "learning_rate": 5e-05, + "loss": 3.856, + "step": 8000 + }, + { + "epoch": 0.009319191632608474, + "grad_norm": 2.694687604904175, + "learning_rate": 5e-05, + "loss": 3.759, + "step": 9000 + }, + { + "epoch": 0.01035465736956497, + "grad_norm": 1.9065606594085693, + "learning_rate": 5e-05, + "loss": 3.6538, + "step": 10000 + }, + { + "epoch": 0.01035465736956497, + "eval_loss": 3.5857133865356445, + "eval_runtime": 17.7332, + "eval_samples_per_second": 2788.951, + "eval_steps_per_second": 10.94, + "step": 10000 + }, + { + "epoch": 0.011390123106521467, + "grad_norm": 1.7840981483459473, + "learning_rate": 5e-05, + "loss": 3.5709, + "step": 11000 + }, + { + "epoch": 0.012425588843477964, + "grad_norm": 2.0488035678863525, + "learning_rate": 5e-05, + "loss": 3.5079, + "step": 12000 + }, + { + "epoch": 0.013461054580434461, + "grad_norm": 1.8077210187911987, + "learning_rate": 5e-05, + "loss": 3.4421, + "step": 13000 + }, + { + "epoch": 0.014496520317390958, + "grad_norm": 1.925595998764038, + "learning_rate": 5e-05, + "loss": 3.3707, + "step": 14000 + }, + { + "epoch": 0.015531986054347455, + "grad_norm": 2.2106831073760986, + "learning_rate": 5e-05, + "loss": 3.3201, + "step": 15000 + }, + { + "epoch": 0.015531986054347455, + "eval_loss": 3.2835636138916016, + "eval_runtime": 24.2836, + "eval_samples_per_second": 2036.641, + "eval_steps_per_second": 7.989, + "step": 15000 + }, + { + "epoch": 0.016567451791303953, + "grad_norm": 2.162048578262329, + "learning_rate": 5e-05, + "loss": 3.2932, + "step": 16000 + }, + { + "epoch": 0.01760291752826045, + "grad_norm": 2.2526283264160156, + "learning_rate": 5e-05, + "loss": 3.2308, + "step": 17000 + }, + { + "epoch": 0.018638383265216947, + "grad_norm": 1.7822258472442627, + "learning_rate": 5e-05, + "loss": 3.2012, + "step": 18000 + }, + { + "epoch": 0.019673849002173442, + "grad_norm": 2.0987908840179443, + "learning_rate": 5e-05, + "loss": 3.1437, + "step": 19000 + }, + { + "epoch": 0.02070931473912994, + "grad_norm": 1.7948940992355347, + "learning_rate": 5e-05, + "loss": 3.1402, + "step": 20000 + }, + { + "epoch": 0.02070931473912994, + "eval_loss": 3.0880234241485596, + "eval_runtime": 16.8033, + "eval_samples_per_second": 2943.298, + "eval_steps_per_second": 11.545, + "step": 20000 + }, + { + "epoch": 0.021744780476086436, + "grad_norm": 1.7310450077056885, + "learning_rate": 5e-05, + "loss": 3.0985, + "step": 21000 + }, + { + "epoch": 0.022780246213042935, + "grad_norm": 1.727274775505066, + "learning_rate": 5e-05, + "loss": 3.0633, + "step": 22000 + }, + { + "epoch": 0.02381571194999943, + "grad_norm": 1.6332778930664062, + "learning_rate": 5e-05, + "loss": 3.0373, + "step": 23000 + }, + { + "epoch": 0.024851177686955928, + "grad_norm": 1.7220966815948486, + "learning_rate": 5e-05, + "loss": 3.0169, + "step": 24000 + }, + { + "epoch": 0.025886643423912423, + "grad_norm": 2.0234944820404053, + "learning_rate": 5e-05, + "loss": 2.9761, + "step": 25000 + }, + { + "epoch": 0.025886643423912423, + "eval_loss": 2.950173854827881, + "eval_runtime": 20.4691, + "eval_samples_per_second": 2416.183, + "eval_steps_per_second": 9.478, + "step": 25000 + }, + { + "epoch": 0.026922109160868922, + "grad_norm": 1.9352872371673584, + "learning_rate": 5e-05, + "loss": 2.9704, + "step": 26000 + }, + { + "epoch": 0.027957574897825417, + "grad_norm": 2.0853705406188965, + "learning_rate": 5e-05, + "loss": 2.9426, + "step": 27000 + }, + { + "epoch": 0.028993040634781916, + "grad_norm": 1.5806540250778198, + "learning_rate": 5e-05, + "loss": 2.9182, + "step": 28000 + }, + { + "epoch": 0.03002850637173841, + "grad_norm": 1.729049801826477, + "learning_rate": 5e-05, + "loss": 2.8912, + "step": 29000 + }, + { + "epoch": 0.03106397210869491, + "grad_norm": 2.1811740398406982, + "learning_rate": 5e-05, + "loss": 2.8908, + "step": 30000 + }, + { + "epoch": 0.03106397210869491, + "eval_loss": 2.8682405948638916, + "eval_runtime": 16.5574, + "eval_samples_per_second": 2987.007, + "eval_steps_per_second": 11.717, + "step": 30000 + }, + { + "epoch": 0.032099437845651405, + "grad_norm": 2.0321109294891357, + "learning_rate": 5e-05, + "loss": 2.8732, + "step": 31000 + }, + { + "epoch": 0.03313490358260791, + "grad_norm": 2.065203905105591, + "learning_rate": 5e-05, + "loss": 2.8576, + "step": 32000 + }, + { + "epoch": 0.0341703693195644, + "grad_norm": 1.7852224111557007, + "learning_rate": 5e-05, + "loss": 2.8378, + "step": 33000 + }, + { + "epoch": 0.0352058350565209, + "grad_norm": 1.6953657865524292, + "learning_rate": 5e-05, + "loss": 2.8113, + "step": 34000 + }, + { + "epoch": 0.03624130079347739, + "grad_norm": 1.9876961708068848, + "learning_rate": 5e-05, + "loss": 2.8135, + "step": 35000 + }, + { + "epoch": 0.03624130079347739, + "eval_loss": 2.7857329845428467, + "eval_runtime": 20.7337, + "eval_samples_per_second": 2385.344, + "eval_steps_per_second": 9.357, + "step": 35000 + }, + { + "epoch": 0.037276766530433894, + "grad_norm": 2.0485334396362305, + "learning_rate": 5e-05, + "loss": 2.8003, + "step": 36000 + }, + { + "epoch": 0.03831223226739039, + "grad_norm": 2.228898048400879, + "learning_rate": 5e-05, + "loss": 2.7635, + "step": 37000 + }, + { + "epoch": 0.039347698004346884, + "grad_norm": 2.039750337600708, + "learning_rate": 5e-05, + "loss": 2.7655, + "step": 38000 + }, + { + "epoch": 0.04038316374130338, + "grad_norm": 1.6063960790634155, + "learning_rate": 5e-05, + "loss": 2.7665, + "step": 39000 + }, + { + "epoch": 0.04141862947825988, + "grad_norm": 1.679189920425415, + "learning_rate": 5e-05, + "loss": 2.7416, + "step": 40000 + }, + { + "epoch": 0.04141862947825988, + "eval_loss": 2.7217838764190674, + "eval_runtime": 17.7772, + "eval_samples_per_second": 2782.047, + "eval_steps_per_second": 10.913, + "step": 40000 + }, + { + "epoch": 0.04245409521521638, + "grad_norm": 2.0102996826171875, + "learning_rate": 5e-05, + "loss": 2.7269, + "step": 41000 + }, + { + "epoch": 0.04348956095217287, + "grad_norm": 1.8129605054855347, + "learning_rate": 5e-05, + "loss": 2.7308, + "step": 42000 + }, + { + "epoch": 0.04452502668912937, + "grad_norm": 1.7243841886520386, + "learning_rate": 5e-05, + "loss": 2.6989, + "step": 43000 + }, + { + "epoch": 0.04556049242608587, + "grad_norm": 1.8720917701721191, + "learning_rate": 5e-05, + "loss": 2.7039, + "step": 44000 + }, + { + "epoch": 0.046595958163042364, + "grad_norm": 1.686995506286621, + "learning_rate": 5e-05, + "loss": 2.6911, + "step": 45000 + }, + { + "epoch": 0.046595958163042364, + "eval_loss": 2.669776678085327, + "eval_runtime": 24.3059, + "eval_samples_per_second": 2034.773, + "eval_steps_per_second": 7.982, + "step": 45000 + }, + { + "epoch": 0.04763142389999886, + "grad_norm": 1.7721649408340454, + "learning_rate": 5e-05, + "loss": 2.6801, + "step": 46000 + }, + { + "epoch": 0.04866688963695536, + "grad_norm": 1.7684073448181152, + "learning_rate": 5e-05, + "loss": 2.6667, + "step": 47000 + }, + { + "epoch": 0.049702355373911856, + "grad_norm": 1.6600712537765503, + "learning_rate": 5e-05, + "loss": 2.6588, + "step": 48000 + }, + { + "epoch": 0.05073782111086835, + "grad_norm": 1.6547819375991821, + "learning_rate": 5e-05, + "loss": 2.6545, + "step": 49000 + }, + { + "epoch": 0.05177328684782485, + "grad_norm": 1.6996227502822876, + "learning_rate": 5e-05, + "loss": 2.6362, + "step": 50000 + }, + { + "epoch": 0.05177328684782485, + "eval_loss": 2.6260855197906494, + "eval_runtime": 18.6918, + "eval_samples_per_second": 2645.921, + "eval_steps_per_second": 10.379, + "step": 50000 + }, + { + "epoch": 0.05280875258478135, + "grad_norm": 2.2706496715545654, + "learning_rate": 5e-05, + "loss": 2.63, + "step": 51000 + }, + { + "epoch": 0.053844218321737844, + "grad_norm": 1.6703062057495117, + "learning_rate": 5e-05, + "loss": 2.6243, + "step": 52000 + }, + { + "epoch": 0.05487968405869434, + "grad_norm": 1.6016428470611572, + "learning_rate": 5e-05, + "loss": 2.6266, + "step": 53000 + }, + { + "epoch": 0.055915149795650834, + "grad_norm": 1.567052960395813, + "learning_rate": 5e-05, + "loss": 2.6115, + "step": 54000 + }, + { + "epoch": 0.056950615532607336, + "grad_norm": 1.7554875612258911, + "learning_rate": 5e-05, + "loss": 2.6094, + "step": 55000 + }, + { + "epoch": 0.056950615532607336, + "eval_loss": 2.5904622077941895, + "eval_runtime": 24.6243, + "eval_samples_per_second": 2008.46, + "eval_steps_per_second": 7.878, + "step": 55000 + }, + { + "epoch": 0.05798608126956383, + "grad_norm": 2.1339058876037598, + "learning_rate": 5e-05, + "loss": 2.6088, + "step": 56000 + }, + { + "epoch": 0.05902154700652033, + "grad_norm": 2.116978406906128, + "learning_rate": 5e-05, + "loss": 2.5887, + "step": 57000 + }, + { + "epoch": 0.06005701274347682, + "grad_norm": 1.9898152351379395, + "learning_rate": 5e-05, + "loss": 2.586, + "step": 58000 + }, + { + "epoch": 0.061092478480433324, + "grad_norm": 1.9398534297943115, + "learning_rate": 5e-05, + "loss": 2.5883, + "step": 59000 + }, + { + "epoch": 0.06212794421738982, + "grad_norm": 1.707964301109314, + "learning_rate": 5e-05, + "loss": 2.5718, + "step": 60000 + }, + { + "epoch": 0.06212794421738982, + "eval_loss": 2.546797513961792, + "eval_runtime": 17.6579, + "eval_samples_per_second": 2800.839, + "eval_steps_per_second": 10.987, + "step": 60000 + }, + { + "epoch": 0.06316340995434631, + "grad_norm": 1.5731209516525269, + "learning_rate": 5e-05, + "loss": 2.5716, + "step": 61000 + }, + { + "epoch": 0.06419887569130281, + "grad_norm": 2.1644086837768555, + "learning_rate": 5e-05, + "loss": 2.5653, + "step": 62000 + }, + { + "epoch": 0.0652343414282593, + "grad_norm": 1.5315316915512085, + "learning_rate": 5e-05, + "loss": 2.5352, + "step": 63000 + }, + { + "epoch": 0.06626980716521581, + "grad_norm": 1.4464577436447144, + "learning_rate": 5e-05, + "loss": 2.5527, + "step": 64000 + }, + { + "epoch": 0.06730527290217231, + "grad_norm": 1.8106459379196167, + "learning_rate": 5e-05, + "loss": 2.5368, + "step": 65000 + }, + { + "epoch": 0.06730527290217231, + "eval_loss": 2.5234298706054688, + "eval_runtime": 20.0466, + "eval_samples_per_second": 2467.096, + "eval_steps_per_second": 9.677, + "step": 65000 + }, + { + "epoch": 0.0683407386391288, + "grad_norm": 1.8276619911193848, + "learning_rate": 5e-05, + "loss": 2.5312, + "step": 66000 + }, + { + "epoch": 0.0693762043760853, + "grad_norm": 1.4809328317642212, + "learning_rate": 5e-05, + "loss": 2.5363, + "step": 67000 + }, + { + "epoch": 0.0704116701130418, + "grad_norm": 1.711206316947937, + "learning_rate": 5e-05, + "loss": 2.527, + "step": 68000 + }, + { + "epoch": 0.07144713584999829, + "grad_norm": 1.7342501878738403, + "learning_rate": 5e-05, + "loss": 2.5039, + "step": 69000 + }, + { + "epoch": 0.07248260158695478, + "grad_norm": 1.5501240491867065, + "learning_rate": 5e-05, + "loss": 2.5066, + "step": 70000 + }, + { + "epoch": 0.07248260158695478, + "eval_loss": 2.489656448364258, + "eval_runtime": 17.4023, + "eval_samples_per_second": 2841.977, + "eval_steps_per_second": 11.148, + "step": 70000 + }, + { + "epoch": 0.07351806732391128, + "grad_norm": 1.970528483390808, + "learning_rate": 5e-05, + "loss": 2.5052, + "step": 71000 + }, + { + "epoch": 0.07455353306086779, + "grad_norm": 1.6995656490325928, + "learning_rate": 5e-05, + "loss": 2.4875, + "step": 72000 + }, + { + "epoch": 0.07558899879782428, + "grad_norm": 1.906000018119812, + "learning_rate": 5e-05, + "loss": 2.4934, + "step": 73000 + }, + { + "epoch": 0.07662446453478078, + "grad_norm": 1.8074511289596558, + "learning_rate": 5e-05, + "loss": 2.494, + "step": 74000 + }, + { + "epoch": 0.07765993027173727, + "grad_norm": 1.6471534967422485, + "learning_rate": 5e-05, + "loss": 2.4962, + "step": 75000 + }, + { + "epoch": 0.07765993027173727, + "eval_loss": 2.462290048599243, + "eval_runtime": 31.7738, + "eval_samples_per_second": 1556.535, + "eval_steps_per_second": 6.106, + "step": 75000 + }, + { + "epoch": 0.07869539600869377, + "grad_norm": 1.7024720907211304, + "learning_rate": 5e-05, + "loss": 2.4639, + "step": 76000 + }, + { + "epoch": 0.07973086174565026, + "grad_norm": 2.064392328262329, + "learning_rate": 5e-05, + "loss": 2.4731, + "step": 77000 + }, + { + "epoch": 0.08076632748260676, + "grad_norm": 2.113358497619629, + "learning_rate": 5e-05, + "loss": 2.4613, + "step": 78000 + }, + { + "epoch": 0.08180179321956327, + "grad_norm": 1.6446911096572876, + "learning_rate": 5e-05, + "loss": 2.4614, + "step": 79000 + }, + { + "epoch": 0.08283725895651976, + "grad_norm": 1.7507164478302002, + "learning_rate": 5e-05, + "loss": 2.4637, + "step": 80000 + }, + { + "epoch": 0.08283725895651976, + "eval_loss": 2.445667266845703, + "eval_runtime": 16.9458, + "eval_samples_per_second": 2918.533, + "eval_steps_per_second": 11.448, + "step": 80000 + }, + { + "epoch": 0.08387272469347626, + "grad_norm": 1.8405325412750244, + "learning_rate": 5e-05, + "loss": 2.4431, + "step": 81000 + }, + { + "epoch": 0.08490819043043275, + "grad_norm": 1.7342174053192139, + "learning_rate": 5e-05, + "loss": 2.4382, + "step": 82000 + }, + { + "epoch": 0.08594365616738925, + "grad_norm": 1.7074140310287476, + "learning_rate": 5e-05, + "loss": 2.446, + "step": 83000 + }, + { + "epoch": 0.08697912190434574, + "grad_norm": 1.7877367734909058, + "learning_rate": 5e-05, + "loss": 2.4437, + "step": 84000 + }, + { + "epoch": 0.08801458764130224, + "grad_norm": 1.3164676427841187, + "learning_rate": 5e-05, + "loss": 2.4365, + "step": 85000 + }, + { + "epoch": 0.08801458764130224, + "eval_loss": 2.418172597885132, + "eval_runtime": 17.7099, + "eval_samples_per_second": 2792.626, + "eval_steps_per_second": 10.954, + "step": 85000 + }, + { + "epoch": 0.08905005337825873, + "grad_norm": 1.9907282590866089, + "learning_rate": 5e-05, + "loss": 2.427, + "step": 86000 + }, + { + "epoch": 0.09008551911521524, + "grad_norm": 1.7687242031097412, + "learning_rate": 5e-05, + "loss": 2.4223, + "step": 87000 + }, + { + "epoch": 0.09112098485217174, + "grad_norm": 1.5485864877700806, + "learning_rate": 5e-05, + "loss": 2.4204, + "step": 88000 + }, + { + "epoch": 0.09215645058912823, + "grad_norm": 1.7300666570663452, + "learning_rate": 5e-05, + "loss": 2.4314, + "step": 89000 + }, + { + "epoch": 0.09319191632608473, + "grad_norm": 2.0416300296783447, + "learning_rate": 5e-05, + "loss": 2.4154, + "step": 90000 + }, + { + "epoch": 0.09319191632608473, + "eval_loss": 2.4060208797454834, + "eval_runtime": 18.224, + "eval_samples_per_second": 2713.838, + "eval_steps_per_second": 10.645, + "step": 90000 + }, + { + "epoch": 0.09422738206304122, + "grad_norm": 1.6988297700881958, + "learning_rate": 5e-05, + "loss": 2.406, + "step": 91000 + }, + { + "epoch": 0.09526284779999772, + "grad_norm": 1.6872934103012085, + "learning_rate": 5e-05, + "loss": 2.407, + "step": 92000 + }, + { + "epoch": 0.09629831353695421, + "grad_norm": 1.7713836431503296, + "learning_rate": 5e-05, + "loss": 2.4032, + "step": 93000 + }, + { + "epoch": 0.09733377927391072, + "grad_norm": 1.6413403749465942, + "learning_rate": 5e-05, + "loss": 2.3996, + "step": 94000 + }, + { + "epoch": 0.09836924501086722, + "grad_norm": 1.9076873064041138, + "learning_rate": 5e-05, + "loss": 2.4004, + "step": 95000 + }, + { + "epoch": 0.09836924501086722, + "eval_loss": 2.380493640899658, + "eval_runtime": 17.1775, + "eval_samples_per_second": 2879.167, + "eval_steps_per_second": 11.294, + "step": 95000 + }, + { + "epoch": 0.09940471074782371, + "grad_norm": 1.6223602294921875, + "learning_rate": 5e-05, + "loss": 2.3839, + "step": 96000 + }, + { + "epoch": 0.10044017648478021, + "grad_norm": 1.9881786108016968, + "learning_rate": 5e-05, + "loss": 2.4024, + "step": 97000 + }, + { + "epoch": 0.1014756422217367, + "grad_norm": 1.5491753816604614, + "learning_rate": 5e-05, + "loss": 2.3796, + "step": 98000 + }, + { + "epoch": 0.1025111079586932, + "grad_norm": 1.735318899154663, + "learning_rate": 5e-05, + "loss": 2.3753, + "step": 99000 + }, + { + "epoch": 0.1035465736956497, + "grad_norm": 1.5919784307479858, + "learning_rate": 5e-05, + "loss": 2.3768, + "step": 100000 + }, + { + "epoch": 0.1035465736956497, + "eval_loss": 2.3675897121429443, + "eval_runtime": 18.2938, + "eval_samples_per_second": 2703.485, + "eval_steps_per_second": 10.605, + "step": 100000 + }, + { + "epoch": 0.10458203943260619, + "grad_norm": 1.3763296604156494, + "learning_rate": 5e-05, + "loss": 2.3749, + "step": 101000 + }, + { + "epoch": 0.1056175051695627, + "grad_norm": 1.8743693828582764, + "learning_rate": 5e-05, + "loss": 2.3675, + "step": 102000 + }, + { + "epoch": 0.10665297090651919, + "grad_norm": 1.7040822505950928, + "learning_rate": 5e-05, + "loss": 2.3778, + "step": 103000 + }, + { + "epoch": 0.10768843664347569, + "grad_norm": 1.355368971824646, + "learning_rate": 5e-05, + "loss": 2.3658, + "step": 104000 + }, + { + "epoch": 0.10872390238043218, + "grad_norm": 1.8316023349761963, + "learning_rate": 5e-05, + "loss": 2.3613, + "step": 105000 + }, + { + "epoch": 0.10872390238043218, + "eval_loss": 2.3513875007629395, + "eval_runtime": 20.1363, + "eval_samples_per_second": 2456.116, + "eval_steps_per_second": 9.634, + "step": 105000 + }, + { + "epoch": 0.10975936811738868, + "grad_norm": 1.6225669384002686, + "learning_rate": 5e-05, + "loss": 2.3666, + "step": 106000 + }, + { + "epoch": 0.11079483385434517, + "grad_norm": 1.6963558197021484, + "learning_rate": 5e-05, + "loss": 2.3507, + "step": 107000 + }, + { + "epoch": 0.11183029959130167, + "grad_norm": 1.7828996181488037, + "learning_rate": 5e-05, + "loss": 2.3457, + "step": 108000 + }, + { + "epoch": 0.11286576532825818, + "grad_norm": 1.6532371044158936, + "learning_rate": 5e-05, + "loss": 2.3637, + "step": 109000 + }, + { + "epoch": 0.11390123106521467, + "grad_norm": 2.0967419147491455, + "learning_rate": 5e-05, + "loss": 2.3392, + "step": 110000 + }, + { + "epoch": 0.11390123106521467, + "eval_loss": 2.337290048599243, + "eval_runtime": 17.1687, + "eval_samples_per_second": 2880.654, + "eval_steps_per_second": 11.3, + "step": 110000 + }, + { + "epoch": 0.11493669680217117, + "grad_norm": 2.1648826599121094, + "learning_rate": 5e-05, + "loss": 2.3377, + "step": 111000 + }, + { + "epoch": 0.11597216253912766, + "grad_norm": 1.6574516296386719, + "learning_rate": 5e-05, + "loss": 2.3364, + "step": 112000 + }, + { + "epoch": 0.11700762827608416, + "grad_norm": 1.8057987689971924, + "learning_rate": 5e-05, + "loss": 2.3341, + "step": 113000 + }, + { + "epoch": 0.11804309401304065, + "grad_norm": 1.8663420677185059, + "learning_rate": 5e-05, + "loss": 2.3392, + "step": 114000 + }, + { + "epoch": 0.11907855974999715, + "grad_norm": 1.688050389289856, + "learning_rate": 5e-05, + "loss": 2.3386, + "step": 115000 + }, + { + "epoch": 0.11907855974999715, + "eval_loss": 2.3217532634735107, + "eval_runtime": 17.6363, + "eval_samples_per_second": 2804.28, + "eval_steps_per_second": 11.0, + "step": 115000 + }, + { + "epoch": 0.12011402548695364, + "grad_norm": 2.507835626602173, + "learning_rate": 5e-05, + "loss": 2.3248, + "step": 116000 + }, + { + "epoch": 0.12114949122391015, + "grad_norm": 1.6291868686676025, + "learning_rate": 5e-05, + "loss": 2.325, + "step": 117000 + }, + { + "epoch": 0.12218495696086665, + "grad_norm": 1.5350573062896729, + "learning_rate": 5e-05, + "loss": 2.3386, + "step": 118000 + }, + { + "epoch": 0.12322042269782314, + "grad_norm": 1.751421332359314, + "learning_rate": 5e-05, + "loss": 2.3197, + "step": 119000 + }, + { + "epoch": 0.12425588843477964, + "grad_norm": 1.9885656833648682, + "learning_rate": 5e-05, + "loss": 2.3221, + "step": 120000 + }, + { + "epoch": 0.12425588843477964, + "eval_loss": 2.2982702255249023, + "eval_runtime": 18.9907, + "eval_samples_per_second": 2604.28, + "eval_steps_per_second": 10.216, + "step": 120000 + }, + { + "epoch": 0.12529135417173615, + "grad_norm": 1.4570821523666382, + "learning_rate": 5e-05, + "loss": 2.3159, + "step": 121000 + }, + { + "epoch": 0.12632681990869263, + "grad_norm": 1.4419127702713013, + "learning_rate": 5e-05, + "loss": 2.3184, + "step": 122000 + }, + { + "epoch": 0.12736228564564914, + "grad_norm": 1.6944835186004639, + "learning_rate": 5e-05, + "loss": 2.3131, + "step": 123000 + }, + { + "epoch": 0.12839775138260562, + "grad_norm": 1.7291606664657593, + "learning_rate": 5e-05, + "loss": 2.3031, + "step": 124000 + }, + { + "epoch": 0.12943321711956213, + "grad_norm": 1.6769369840621948, + "learning_rate": 5e-05, + "loss": 2.2995, + "step": 125000 + }, + { + "epoch": 0.12943321711956213, + "eval_loss": 2.295583724975586, + "eval_runtime": 31.5701, + "eval_samples_per_second": 1566.579, + "eval_steps_per_second": 6.145, + "step": 125000 + }, + { + "epoch": 0.1304686828565186, + "grad_norm": 1.7094770669937134, + "learning_rate": 5e-05, + "loss": 2.303, + "step": 126000 + }, + { + "epoch": 0.13150414859347512, + "grad_norm": 1.5756804943084717, + "learning_rate": 5e-05, + "loss": 2.3048, + "step": 127000 + }, + { + "epoch": 0.13253961433043163, + "grad_norm": 1.6020985841751099, + "learning_rate": 5e-05, + "loss": 2.3037, + "step": 128000 + }, + { + "epoch": 0.1335750800673881, + "grad_norm": 1.2563997507095337, + "learning_rate": 5e-05, + "loss": 2.2906, + "step": 129000 + }, + { + "epoch": 0.13461054580434462, + "grad_norm": 1.8040730953216553, + "learning_rate": 5e-05, + "loss": 2.2813, + "step": 130000 + }, + { + "epoch": 0.13461054580434462, + "eval_loss": 2.2846579551696777, + "eval_runtime": 19.9964, + "eval_samples_per_second": 2473.3, + "eval_steps_per_second": 9.702, + "step": 130000 + }, + { + "epoch": 0.1356460115413011, + "grad_norm": 1.693924903869629, + "learning_rate": 5e-05, + "loss": 2.2897, + "step": 131000 + }, + { + "epoch": 0.1366814772782576, + "grad_norm": 1.8027820587158203, + "learning_rate": 5e-05, + "loss": 2.2875, + "step": 132000 + }, + { + "epoch": 0.1377169430152141, + "grad_norm": 1.7695696353912354, + "learning_rate": 5e-05, + "loss": 2.2833, + "step": 133000 + }, + { + "epoch": 0.1387524087521706, + "grad_norm": 1.5467225313186646, + "learning_rate": 5e-05, + "loss": 2.2945, + "step": 134000 + }, + { + "epoch": 0.13978787448912708, + "grad_norm": 1.5252859592437744, + "learning_rate": 5e-05, + "loss": 2.2698, + "step": 135000 + }, + { + "epoch": 0.13978787448912708, + "eval_loss": 2.271275520324707, + "eval_runtime": 18.9188, + "eval_samples_per_second": 2614.178, + "eval_steps_per_second": 10.254, + "step": 135000 + }, + { + "epoch": 0.1408233402260836, + "grad_norm": 1.7169549465179443, + "learning_rate": 5e-05, + "loss": 2.2983, + "step": 136000 + }, + { + "epoch": 0.1418588059630401, + "grad_norm": 1.4472647905349731, + "learning_rate": 5e-05, + "loss": 2.2863, + "step": 137000 + }, + { + "epoch": 0.14289427169999658, + "grad_norm": 1.9395118951797485, + "learning_rate": 5e-05, + "loss": 2.2764, + "step": 138000 + }, + { + "epoch": 0.1439297374369531, + "grad_norm": 1.5419905185699463, + "learning_rate": 5e-05, + "loss": 2.2753, + "step": 139000 + }, + { + "epoch": 0.14496520317390957, + "grad_norm": 1.6046137809753418, + "learning_rate": 5e-05, + "loss": 2.2895, + "step": 140000 + }, + { + "epoch": 0.14496520317390957, + "eval_loss": 2.2650840282440186, + "eval_runtime": 17.554, + "eval_samples_per_second": 2817.421, + "eval_steps_per_second": 11.052, + "step": 140000 + }, + { + "epoch": 0.14600066891086608, + "grad_norm": 1.755807638168335, + "learning_rate": 5e-05, + "loss": 2.2728, + "step": 141000 + }, + { + "epoch": 0.14703613464782256, + "grad_norm": 2.676787853240967, + "learning_rate": 5e-05, + "loss": 2.2562, + "step": 142000 + }, + { + "epoch": 0.14807160038477907, + "grad_norm": 1.503091812133789, + "learning_rate": 5e-05, + "loss": 2.256, + "step": 143000 + }, + { + "epoch": 0.14910706612173558, + "grad_norm": 1.8047343492507935, + "learning_rate": 5e-05, + "loss": 2.2588, + "step": 144000 + }, + { + "epoch": 0.15014253185869206, + "grad_norm": 1.6858866214752197, + "learning_rate": 5e-05, + "loss": 2.2605, + "step": 145000 + }, + { + "epoch": 0.15014253185869206, + "eval_loss": 2.239438056945801, + "eval_runtime": 30.413, + "eval_samples_per_second": 1626.178, + "eval_steps_per_second": 6.379, + "step": 145000 + }, + { + "epoch": 0.15117799759564857, + "grad_norm": 1.4932698011398315, + "learning_rate": 5e-05, + "loss": 2.261, + "step": 146000 + }, + { + "epoch": 0.15221346333260505, + "grad_norm": 1.627295732498169, + "learning_rate": 5e-05, + "loss": 2.2472, + "step": 147000 + }, + { + "epoch": 0.15324892906956156, + "grad_norm": 1.7859046459197998, + "learning_rate": 5e-05, + "loss": 2.2594, + "step": 148000 + }, + { + "epoch": 0.15428439480651804, + "grad_norm": 1.6636815071105957, + "learning_rate": 5e-05, + "loss": 2.2594, + "step": 149000 + }, + { + "epoch": 0.15531986054347455, + "grad_norm": 2.078432083129883, + "learning_rate": 5e-05, + "loss": 2.256, + "step": 150000 + }, + { + "epoch": 0.15531986054347455, + "eval_loss": 2.2415122985839844, + "eval_runtime": 18.5954, + "eval_samples_per_second": 2659.638, + "eval_steps_per_second": 10.433, + "step": 150000 + }, + { + "epoch": 0.15635532628043106, + "grad_norm": 1.2664598226547241, + "learning_rate": 5e-05, + "loss": 2.2497, + "step": 151000 + }, + { + "epoch": 0.15739079201738754, + "grad_norm": 1.5538097620010376, + "learning_rate": 5e-05, + "loss": 2.2499, + "step": 152000 + }, + { + "epoch": 0.15842625775434405, + "grad_norm": 1.8865635395050049, + "learning_rate": 5e-05, + "loss": 2.2489, + "step": 153000 + }, + { + "epoch": 0.15946172349130053, + "grad_norm": 1.5606614351272583, + "learning_rate": 5e-05, + "loss": 2.2388, + "step": 154000 + }, + { + "epoch": 0.16049718922825704, + "grad_norm": 1.9067413806915283, + "learning_rate": 5e-05, + "loss": 2.24, + "step": 155000 + }, + { + "epoch": 0.16049718922825704, + "eval_loss": 2.2304790019989014, + "eval_runtime": 17.7179, + "eval_samples_per_second": 2791.36, + "eval_steps_per_second": 10.949, + "step": 155000 + }, + { + "epoch": 0.16153265496521352, + "grad_norm": 1.8727903366088867, + "learning_rate": 5e-05, + "loss": 2.2338, + "step": 156000 + }, + { + "epoch": 0.16256812070217003, + "grad_norm": 1.8794825077056885, + "learning_rate": 5e-05, + "loss": 2.2472, + "step": 157000 + }, + { + "epoch": 0.16360358643912654, + "grad_norm": 1.9586312770843506, + "learning_rate": 5e-05, + "loss": 2.2411, + "step": 158000 + }, + { + "epoch": 0.16463905217608302, + "grad_norm": 2.5029993057250977, + "learning_rate": 5e-05, + "loss": 2.245, + "step": 159000 + }, + { + "epoch": 0.16567451791303953, + "grad_norm": 1.5990197658538818, + "learning_rate": 5e-05, + "loss": 2.2239, + "step": 160000 + }, + { + "epoch": 0.16567451791303953, + "eval_loss": 2.219214916229248, + "eval_runtime": 19.7117, + "eval_samples_per_second": 2509.014, + "eval_steps_per_second": 9.842, + "step": 160000 + }, + { + "epoch": 0.166709983649996, + "grad_norm": 2.100264072418213, + "learning_rate": 5e-05, + "loss": 2.2337, + "step": 161000 + }, + { + "epoch": 0.16774544938695252, + "grad_norm": 1.7322697639465332, + "learning_rate": 5e-05, + "loss": 2.2222, + "step": 162000 + }, + { + "epoch": 0.168780915123909, + "grad_norm": 1.6511707305908203, + "learning_rate": 5e-05, + "loss": 2.2287, + "step": 163000 + }, + { + "epoch": 0.1698163808608655, + "grad_norm": 1.5296807289123535, + "learning_rate": 5e-05, + "loss": 2.2269, + "step": 164000 + }, + { + "epoch": 0.170851846597822, + "grad_norm": 1.788960337638855, + "learning_rate": 5e-05, + "loss": 2.2237, + "step": 165000 + }, + { + "epoch": 0.170851846597822, + "eval_loss": 2.2153329849243164, + "eval_runtime": 18.4747, + "eval_samples_per_second": 2677.015, + "eval_steps_per_second": 10.501, + "step": 165000 + }, + { + "epoch": 0.1718873123347785, + "grad_norm": 1.592215657234192, + "learning_rate": 5e-05, + "loss": 2.2246, + "step": 166000 + }, + { + "epoch": 0.172922778071735, + "grad_norm": 1.7051317691802979, + "learning_rate": 5e-05, + "loss": 2.2075, + "step": 167000 + }, + { + "epoch": 0.1739582438086915, + "grad_norm": 1.8867826461791992, + "learning_rate": 5e-05, + "loss": 2.2187, + "step": 168000 + }, + { + "epoch": 0.174993709545648, + "grad_norm": 1.5261858701705933, + "learning_rate": 5e-05, + "loss": 2.2068, + "step": 169000 + }, + { + "epoch": 0.17602917528260448, + "grad_norm": 1.7023552656173706, + "learning_rate": 5e-05, + "loss": 2.231, + "step": 170000 + }, + { + "epoch": 0.17602917528260448, + "eval_loss": 2.200360059738159, + "eval_runtime": 34.4793, + "eval_samples_per_second": 1434.396, + "eval_steps_per_second": 5.627, + "step": 170000 + }, + { + "epoch": 0.177064641019561, + "grad_norm": 1.6329045295715332, + "learning_rate": 5e-05, + "loss": 2.217, + "step": 171000 + }, + { + "epoch": 0.17810010675651747, + "grad_norm": 1.8211392164230347, + "learning_rate": 5e-05, + "loss": 2.2109, + "step": 172000 + }, + { + "epoch": 0.17913557249347398, + "grad_norm": 1.8491740226745605, + "learning_rate": 5e-05, + "loss": 2.21, + "step": 173000 + }, + { + "epoch": 0.18017103823043049, + "grad_norm": 1.5435434579849243, + "learning_rate": 5e-05, + "loss": 2.2026, + "step": 174000 + }, + { + "epoch": 0.18120650396738697, + "grad_norm": 1.8491019010543823, + "learning_rate": 5e-05, + "loss": 2.2101, + "step": 175000 + }, + { + "epoch": 0.18120650396738697, + "eval_loss": 2.1987149715423584, + "eval_runtime": 18.5985, + "eval_samples_per_second": 2659.196, + "eval_steps_per_second": 10.431, + "step": 175000 + }, + { + "epoch": 0.18224196970434348, + "grad_norm": 1.6454964876174927, + "learning_rate": 5e-05, + "loss": 2.2014, + "step": 176000 + }, + { + "epoch": 0.18327743544129996, + "grad_norm": 1.5947151184082031, + "learning_rate": 5e-05, + "loss": 2.2077, + "step": 177000 + }, + { + "epoch": 0.18431290117825647, + "grad_norm": 1.47206711769104, + "learning_rate": 5e-05, + "loss": 2.2087, + "step": 178000 + }, + { + "epoch": 0.18534836691521295, + "grad_norm": 1.5847997665405273, + "learning_rate": 5e-05, + "loss": 2.2198, + "step": 179000 + }, + { + "epoch": 0.18638383265216946, + "grad_norm": 1.543878436088562, + "learning_rate": 5e-05, + "loss": 2.198, + "step": 180000 + }, + { + "epoch": 0.18638383265216946, + "eval_loss": 2.1845173835754395, + "eval_runtime": 19.3852, + "eval_samples_per_second": 2551.271, + "eval_steps_per_second": 10.008, + "step": 180000 + }, + { + "epoch": 0.18741929838912597, + "grad_norm": 2.046430826187134, + "learning_rate": 5e-05, + "loss": 2.1924, + "step": 181000 + }, + { + "epoch": 0.18845476412608245, + "grad_norm": 1.3812047243118286, + "learning_rate": 5e-05, + "loss": 2.1962, + "step": 182000 + }, + { + "epoch": 0.18949022986303896, + "grad_norm": 2.3460304737091064, + "learning_rate": 5e-05, + "loss": 2.2018, + "step": 183000 + }, + { + "epoch": 0.19052569559999544, + "grad_norm": 1.5330251455307007, + "learning_rate": 5e-05, + "loss": 2.1828, + "step": 184000 + }, + { + "epoch": 0.19156116133695195, + "grad_norm": 1.693895697593689, + "learning_rate": 5e-05, + "loss": 2.1763, + "step": 185000 + }, + { + "epoch": 0.19156116133695195, + "eval_loss": 2.172133684158325, + "eval_runtime": 19.1958, + "eval_samples_per_second": 2576.448, + "eval_steps_per_second": 10.106, + "step": 185000 + }, + { + "epoch": 0.19259662707390843, + "grad_norm": 1.7285739183425903, + "learning_rate": 5e-05, + "loss": 2.197, + "step": 186000 + }, + { + "epoch": 0.19363209281086494, + "grad_norm": 1.8407094478607178, + "learning_rate": 5e-05, + "loss": 2.1856, + "step": 187000 + }, + { + "epoch": 0.19466755854782145, + "grad_norm": 1.7621251344680786, + "learning_rate": 5e-05, + "loss": 2.1986, + "step": 188000 + }, + { + "epoch": 0.19570302428477793, + "grad_norm": 1.4718973636627197, + "learning_rate": 5e-05, + "loss": 2.1729, + "step": 189000 + }, + { + "epoch": 0.19673849002173444, + "grad_norm": 2.0074100494384766, + "learning_rate": 5e-05, + "loss": 2.1893, + "step": 190000 + }, + { + "epoch": 0.19673849002173444, + "eval_loss": 2.172874927520752, + "eval_runtime": 18.8943, + "eval_samples_per_second": 2617.559, + "eval_steps_per_second": 10.268, + "step": 190000 + }, + { + "epoch": 0.19777395575869092, + "grad_norm": 1.919875144958496, + "learning_rate": 5e-05, + "loss": 2.1847, + "step": 191000 + }, + { + "epoch": 0.19880942149564743, + "grad_norm": 1.9104081392288208, + "learning_rate": 5e-05, + "loss": 2.1772, + "step": 192000 + }, + { + "epoch": 0.1998448872326039, + "grad_norm": 1.5770833492279053, + "learning_rate": 5e-05, + "loss": 2.1737, + "step": 193000 + }, + { + "epoch": 0.20088035296956042, + "grad_norm": 1.6979726552963257, + "learning_rate": 5e-05, + "loss": 2.1754, + "step": 194000 + }, + { + "epoch": 0.2019158187065169, + "grad_norm": 1.5536943674087524, + "learning_rate": 5e-05, + "loss": 2.1688, + "step": 195000 + }, + { + "epoch": 0.2019158187065169, + "eval_loss": 2.163553476333618, + "eval_runtime": 21.8229, + "eval_samples_per_second": 2266.291, + "eval_steps_per_second": 8.89, + "step": 195000 + }, + { + "epoch": 0.2029512844434734, + "grad_norm": 1.6107367277145386, + "learning_rate": 5e-05, + "loss": 2.1824, + "step": 196000 + }, + { + "epoch": 0.20398675018042992, + "grad_norm": 1.4322165250778198, + "learning_rate": 5e-05, + "loss": 2.1698, + "step": 197000 + }, + { + "epoch": 0.2050222159173864, + "grad_norm": 1.6819088459014893, + "learning_rate": 5e-05, + "loss": 2.1653, + "step": 198000 + }, + { + "epoch": 0.2060576816543429, + "grad_norm": 1.6152656078338623, + "learning_rate": 5e-05, + "loss": 2.1617, + "step": 199000 + }, + { + "epoch": 0.2070931473912994, + "grad_norm": 1.5925724506378174, + "learning_rate": 5e-05, + "loss": 2.1628, + "step": 200000 + }, + { + "epoch": 0.2070931473912994, + "eval_loss": 2.1573803424835205, + "eval_runtime": 19.1084, + "eval_samples_per_second": 2588.231, + "eval_steps_per_second": 10.153, + "step": 200000 + }, + { + "epoch": 0.2081286131282559, + "grad_norm": 2.013730049133301, + "learning_rate": 5e-05, + "loss": 2.1803, + "step": 201000 + }, + { + "epoch": 0.20916407886521238, + "grad_norm": 1.5648040771484375, + "learning_rate": 5e-05, + "loss": 2.1807, + "step": 202000 + }, + { + "epoch": 0.2101995446021689, + "grad_norm": 1.5546032190322876, + "learning_rate": 5e-05, + "loss": 2.1536, + "step": 203000 + }, + { + "epoch": 0.2112350103391254, + "grad_norm": 1.5997415781021118, + "learning_rate": 5e-05, + "loss": 2.1622, + "step": 204000 + }, + { + "epoch": 0.21227047607608188, + "grad_norm": 1.656101942062378, + "learning_rate": 5e-05, + "loss": 2.1599, + "step": 205000 + }, + { + "epoch": 0.21227047607608188, + "eval_loss": 2.156127452850342, + "eval_runtime": 19.0436, + "eval_samples_per_second": 2597.044, + "eval_steps_per_second": 10.187, + "step": 205000 + }, + { + "epoch": 0.21330594181303839, + "grad_norm": 1.6793824434280396, + "learning_rate": 5e-05, + "loss": 2.1545, + "step": 206000 + }, + { + "epoch": 0.21434140754999487, + "grad_norm": 2.1217029094696045, + "learning_rate": 5e-05, + "loss": 2.1508, + "step": 207000 + }, + { + "epoch": 0.21537687328695138, + "grad_norm": 1.2719789743423462, + "learning_rate": 5e-05, + "loss": 2.1573, + "step": 208000 + }, + { + "epoch": 0.21641233902390786, + "grad_norm": 1.5386024713516235, + "learning_rate": 5e-05, + "loss": 2.1478, + "step": 209000 + }, + { + "epoch": 0.21744780476086437, + "grad_norm": 2.2390332221984863, + "learning_rate": 5e-05, + "loss": 2.1593, + "step": 210000 + }, + { + "epoch": 0.21744780476086437, + "eval_loss": 2.1487836837768555, + "eval_runtime": 18.2274, + "eval_samples_per_second": 2713.328, + "eval_steps_per_second": 10.643, + "step": 210000 + }, + { + "epoch": 0.21848327049782088, + "grad_norm": 1.6317909955978394, + "learning_rate": 5e-05, + "loss": 2.1578, + "step": 211000 + }, + { + "epoch": 0.21951873623477736, + "grad_norm": 1.6397149562835693, + "learning_rate": 5e-05, + "loss": 2.1567, + "step": 212000 + }, + { + "epoch": 0.22055420197173387, + "grad_norm": 1.6141470670700073, + "learning_rate": 5e-05, + "loss": 2.1493, + "step": 213000 + }, + { + "epoch": 0.22158966770869035, + "grad_norm": 1.6077841520309448, + "learning_rate": 5e-05, + "loss": 2.1383, + "step": 214000 + }, + { + "epoch": 0.22262513344564686, + "grad_norm": 1.4669524431228638, + "learning_rate": 5e-05, + "loss": 2.1449, + "step": 215000 + }, + { + "epoch": 0.22262513344564686, + "eval_loss": 2.1384260654449463, + "eval_runtime": 18.3245, + "eval_samples_per_second": 2698.962, + "eval_steps_per_second": 10.587, + "step": 215000 + }, + { + "epoch": 0.22366059918260334, + "grad_norm": 1.6204700469970703, + "learning_rate": 5e-05, + "loss": 2.1601, + "step": 216000 + }, + { + "epoch": 0.22469606491955985, + "grad_norm": 1.33708655834198, + "learning_rate": 5e-05, + "loss": 2.1419, + "step": 217000 + }, + { + "epoch": 0.22573153065651635, + "grad_norm": 1.765945315361023, + "learning_rate": 5e-05, + "loss": 2.1406, + "step": 218000 + }, + { + "epoch": 0.22676699639347284, + "grad_norm": 1.6463714838027954, + "learning_rate": 5e-05, + "loss": 2.1533, + "step": 219000 + }, + { + "epoch": 0.22780246213042935, + "grad_norm": 1.5012264251708984, + "learning_rate": 5e-05, + "loss": 2.1446, + "step": 220000 + }, + { + "epoch": 0.22780246213042935, + "eval_loss": 2.1217093467712402, + "eval_runtime": 27.5974, + "eval_samples_per_second": 1792.091, + "eval_steps_per_second": 7.03, + "step": 220000 + }, + { + "epoch": 0.22883792786738583, + "grad_norm": 1.6473504304885864, + "learning_rate": 5e-05, + "loss": 2.1512, + "step": 221000 + }, + { + "epoch": 0.22987339360434234, + "grad_norm": 1.60206139087677, + "learning_rate": 5e-05, + "loss": 2.1426, + "step": 222000 + }, + { + "epoch": 0.23090885934129882, + "grad_norm": 1.8370537757873535, + "learning_rate": 5e-05, + "loss": 2.1438, + "step": 223000 + }, + { + "epoch": 0.23194432507825533, + "grad_norm": 1.3373557329177856, + "learning_rate": 5e-05, + "loss": 2.1315, + "step": 224000 + }, + { + "epoch": 0.2329797908152118, + "grad_norm": 1.6836400032043457, + "learning_rate": 5e-05, + "loss": 2.1354, + "step": 225000 + }, + { + "epoch": 0.2329797908152118, + "eval_loss": 2.125302791595459, + "eval_runtime": 22.4712, + "eval_samples_per_second": 2200.902, + "eval_steps_per_second": 8.633, + "step": 225000 + }, + { + "epoch": 0.23401525655216832, + "grad_norm": 1.7297096252441406, + "learning_rate": 5e-05, + "loss": 2.1325, + "step": 226000 + }, + { + "epoch": 0.23505072228912482, + "grad_norm": 1.4455069303512573, + "learning_rate": 5e-05, + "loss": 2.1299, + "step": 227000 + }, + { + "epoch": 0.2360861880260813, + "grad_norm": 1.429310917854309, + "learning_rate": 5e-05, + "loss": 2.1457, + "step": 228000 + }, + { + "epoch": 0.23712165376303782, + "grad_norm": 1.7447959184646606, + "learning_rate": 5e-05, + "loss": 2.1349, + "step": 229000 + }, + { + "epoch": 0.2381571194999943, + "grad_norm": 1.6983305215835571, + "learning_rate": 5e-05, + "loss": 2.1201, + "step": 230000 + }, + { + "epoch": 0.2381571194999943, + "eval_loss": 2.1202392578125, + "eval_runtime": 19.1303, + "eval_samples_per_second": 2585.264, + "eval_steps_per_second": 10.141, + "step": 230000 + }, + { + "epoch": 0.2391925852369508, + "grad_norm": 1.5676405429840088, + "learning_rate": 5e-05, + "loss": 2.1336, + "step": 231000 + }, + { + "epoch": 0.2402280509739073, + "grad_norm": 1.6670252084732056, + "learning_rate": 5e-05, + "loss": 2.1187, + "step": 232000 + }, + { + "epoch": 0.2412635167108638, + "grad_norm": 1.5296714305877686, + "learning_rate": 5e-05, + "loss": 2.125, + "step": 233000 + }, + { + "epoch": 0.2422989824478203, + "grad_norm": 1.5627477169036865, + "learning_rate": 5e-05, + "loss": 2.121, + "step": 234000 + }, + { + "epoch": 0.24333444818477679, + "grad_norm": 2.2698166370391846, + "learning_rate": 5e-05, + "loss": 2.1426, + "step": 235000 + }, + { + "epoch": 0.24333444818477679, + "eval_loss": 2.1195831298828125, + "eval_runtime": 18.9698, + "eval_samples_per_second": 2607.149, + "eval_steps_per_second": 10.227, + "step": 235000 + }, + { + "epoch": 0.2443699139217333, + "grad_norm": 1.9312177896499634, + "learning_rate": 5e-05, + "loss": 2.1289, + "step": 236000 + }, + { + "epoch": 0.24540537965868978, + "grad_norm": 1.6321558952331543, + "learning_rate": 5e-05, + "loss": 2.1236, + "step": 237000 + }, + { + "epoch": 0.24644084539564629, + "grad_norm": 1.7519464492797852, + "learning_rate": 5e-05, + "loss": 2.1175, + "step": 238000 + }, + { + "epoch": 0.24747631113260277, + "grad_norm": 1.5893385410308838, + "learning_rate": 5e-05, + "loss": 2.1175, + "step": 239000 + }, + { + "epoch": 0.24851177686955928, + "grad_norm": 1.7425572872161865, + "learning_rate": 5e-05, + "loss": 2.1125, + "step": 240000 + }, + { + "epoch": 0.24851177686955928, + "eval_loss": 2.1114916801452637, + "eval_runtime": 18.5753, + "eval_samples_per_second": 2662.508, + "eval_steps_per_second": 10.444, + "step": 240000 + }, + { + "epoch": 0.24954724260651578, + "grad_norm": 1.7862157821655273, + "learning_rate": 5e-05, + "loss": 2.1228, + "step": 241000 + }, + { + "epoch": 0.2505827083434723, + "grad_norm": 1.2887235879898071, + "learning_rate": 5e-05, + "loss": 2.1298, + "step": 242000 + }, + { + "epoch": 0.25161817408042875, + "grad_norm": 1.5348347425460815, + "learning_rate": 5e-05, + "loss": 2.1247, + "step": 243000 + }, + { + "epoch": 0.25265363981738526, + "grad_norm": 1.669761061668396, + "learning_rate": 5e-05, + "loss": 2.1172, + "step": 244000 + }, + { + "epoch": 0.25368910555434177, + "grad_norm": 1.881727933883667, + "learning_rate": 5e-05, + "loss": 2.1118, + "step": 245000 + }, + { + "epoch": 0.25368910555434177, + "eval_loss": 2.103923797607422, + "eval_runtime": 19.0175, + "eval_samples_per_second": 2600.599, + "eval_steps_per_second": 10.201, + "step": 245000 + }, + { + "epoch": 0.2547245712912983, + "grad_norm": 1.744746446609497, + "learning_rate": 5e-05, + "loss": 2.1077, + "step": 246000 + }, + { + "epoch": 0.2557600370282547, + "grad_norm": 1.7865989208221436, + "learning_rate": 5e-05, + "loss": 2.1147, + "step": 247000 + }, + { + "epoch": 0.25679550276521124, + "grad_norm": 1.640703558921814, + "learning_rate": 5e-05, + "loss": 2.1066, + "step": 248000 + }, + { + "epoch": 0.25783096850216775, + "grad_norm": 1.8829026222229004, + "learning_rate": 5e-05, + "loss": 2.1118, + "step": 249000 + }, + { + "epoch": 0.25886643423912425, + "grad_norm": 1.4332600831985474, + "learning_rate": 5e-05, + "loss": 2.1051, + "step": 250000 + }, + { + "epoch": 0.25886643423912425, + "eval_loss": 2.097465753555298, + "eval_runtime": 18.8905, + "eval_samples_per_second": 2618.088, + "eval_steps_per_second": 10.27, + "step": 250000 + }, + { + "epoch": 0.25990189997608076, + "grad_norm": 1.1934382915496826, + "learning_rate": 5e-05, + "loss": 2.1017, + "step": 251000 + }, + { + "epoch": 0.2609373657130372, + "grad_norm": 1.7838383913040161, + "learning_rate": 5e-05, + "loss": 2.1196, + "step": 252000 + }, + { + "epoch": 0.2619728314499937, + "grad_norm": 1.6719322204589844, + "learning_rate": 5e-05, + "loss": 2.1117, + "step": 253000 + }, + { + "epoch": 0.26300829718695024, + "grad_norm": 1.5883870124816895, + "learning_rate": 5e-05, + "loss": 2.1019, + "step": 254000 + }, + { + "epoch": 0.26404376292390674, + "grad_norm": 1.5117872953414917, + "learning_rate": 5e-05, + "loss": 2.1132, + "step": 255000 + }, + { + "epoch": 0.26404376292390674, + "eval_loss": 2.0989463329315186, + "eval_runtime": 21.0632, + "eval_samples_per_second": 2348.034, + "eval_steps_per_second": 9.21, + "step": 255000 + }, + { + "epoch": 0.26507922866086325, + "grad_norm": 1.7486399412155151, + "learning_rate": 5e-05, + "loss": 2.1182, + "step": 256000 + }, + { + "epoch": 0.2661146943978197, + "grad_norm": 1.401721477508545, + "learning_rate": 5e-05, + "loss": 2.1081, + "step": 257000 + }, + { + "epoch": 0.2671501601347762, + "grad_norm": 1.6972362995147705, + "learning_rate": 5e-05, + "loss": 2.0914, + "step": 258000 + }, + { + "epoch": 0.2681856258717327, + "grad_norm": 1.5844149589538574, + "learning_rate": 5e-05, + "loss": 2.0885, + "step": 259000 + }, + { + "epoch": 0.26922109160868923, + "grad_norm": 1.492384910583496, + "learning_rate": 5e-05, + "loss": 2.0961, + "step": 260000 + }, + { + "epoch": 0.26922109160868923, + "eval_loss": 2.090730905532837, + "eval_runtime": 20.2892, + "eval_samples_per_second": 2437.604, + "eval_steps_per_second": 9.562, + "step": 260000 + }, + { + "epoch": 0.2702565573456457, + "grad_norm": 1.7390309572219849, + "learning_rate": 5e-05, + "loss": 2.1005, + "step": 261000 + }, + { + "epoch": 0.2712920230826022, + "grad_norm": 1.9608066082000732, + "learning_rate": 5e-05, + "loss": 2.0958, + "step": 262000 + }, + { + "epoch": 0.2723274888195587, + "grad_norm": 1.6389927864074707, + "learning_rate": 5e-05, + "loss": 2.1028, + "step": 263000 + }, + { + "epoch": 0.2733629545565152, + "grad_norm": 1.43370521068573, + "learning_rate": 5e-05, + "loss": 2.0861, + "step": 264000 + }, + { + "epoch": 0.2743984202934717, + "grad_norm": 1.4951225519180298, + "learning_rate": 5e-05, + "loss": 2.0864, + "step": 265000 + }, + { + "epoch": 0.2743984202934717, + "eval_loss": 2.0853793621063232, + "eval_runtime": 18.9558, + "eval_samples_per_second": 2609.067, + "eval_steps_per_second": 10.234, + "step": 265000 + }, + { + "epoch": 0.2754338860304282, + "grad_norm": 1.507961392402649, + "learning_rate": 5e-05, + "loss": 2.0915, + "step": 266000 + }, + { + "epoch": 0.2764693517673847, + "grad_norm": 1.930547833442688, + "learning_rate": 5e-05, + "loss": 2.0845, + "step": 267000 + }, + { + "epoch": 0.2775048175043412, + "grad_norm": 1.947425127029419, + "learning_rate": 5e-05, + "loss": 2.0861, + "step": 268000 + }, + { + "epoch": 0.2785402832412977, + "grad_norm": 1.6057015657424927, + "learning_rate": 5e-05, + "loss": 2.1003, + "step": 269000 + }, + { + "epoch": 0.27957574897825416, + "grad_norm": 1.6992744207382202, + "learning_rate": 5e-05, + "loss": 2.0844, + "step": 270000 + }, + { + "epoch": 0.27957574897825416, + "eval_loss": 2.0863852500915527, + "eval_runtime": 19.2536, + "eval_samples_per_second": 2568.715, + "eval_steps_per_second": 10.076, + "step": 270000 + }, + { + "epoch": 0.28061121471521067, + "grad_norm": 1.6745386123657227, + "learning_rate": 5e-05, + "loss": 2.0863, + "step": 271000 + }, + { + "epoch": 0.2816466804521672, + "grad_norm": 1.480391502380371, + "learning_rate": 5e-05, + "loss": 2.0832, + "step": 272000 + }, + { + "epoch": 0.2826821461891237, + "grad_norm": 1.7411426305770874, + "learning_rate": 5e-05, + "loss": 2.0885, + "step": 273000 + }, + { + "epoch": 0.2837176119260802, + "grad_norm": 1.5064642429351807, + "learning_rate": 5e-05, + "loss": 2.096, + "step": 274000 + }, + { + "epoch": 0.28475307766303665, + "grad_norm": 1.3575658798217773, + "learning_rate": 5e-05, + "loss": 2.0908, + "step": 275000 + }, + { + "epoch": 0.28475307766303665, + "eval_loss": 2.075481414794922, + "eval_runtime": 21.5208, + "eval_samples_per_second": 2298.108, + "eval_steps_per_second": 9.015, + "step": 275000 + }, + { + "epoch": 0.28578854339999316, + "grad_norm": 1.7280150651931763, + "learning_rate": 5e-05, + "loss": 2.0925, + "step": 276000 + }, + { + "epoch": 0.28682400913694966, + "grad_norm": 1.7355035543441772, + "learning_rate": 5e-05, + "loss": 2.0832, + "step": 277000 + }, + { + "epoch": 0.2878594748739062, + "grad_norm": 1.594399094581604, + "learning_rate": 5e-05, + "loss": 2.0874, + "step": 278000 + }, + { + "epoch": 0.2888949406108627, + "grad_norm": 1.5549061298370361, + "learning_rate": 5e-05, + "loss": 2.0836, + "step": 279000 + }, + { + "epoch": 0.28993040634781914, + "grad_norm": 1.8054208755493164, + "learning_rate": 5e-05, + "loss": 2.0783, + "step": 280000 + }, + { + "epoch": 0.28993040634781914, + "eval_loss": 2.067439317703247, + "eval_runtime": 19.2353, + "eval_samples_per_second": 2571.156, + "eval_steps_per_second": 10.086, + "step": 280000 + }, + { + "epoch": 0.29096587208477565, + "grad_norm": 1.4210501909255981, + "learning_rate": 5e-05, + "loss": 2.0854, + "step": 281000 + }, + { + "epoch": 0.29200133782173215, + "grad_norm": 1.9252066612243652, + "learning_rate": 5e-05, + "loss": 2.083, + "step": 282000 + }, + { + "epoch": 0.29303680355868866, + "grad_norm": 1.6770497560501099, + "learning_rate": 5e-05, + "loss": 2.087, + "step": 283000 + }, + { + "epoch": 0.2940722692956451, + "grad_norm": 1.5951564311981201, + "learning_rate": 5e-05, + "loss": 2.0812, + "step": 284000 + }, + { + "epoch": 0.2951077350326016, + "grad_norm": 1.6986280679702759, + "learning_rate": 5e-05, + "loss": 2.0786, + "step": 285000 + }, + { + "epoch": 0.2951077350326016, + "eval_loss": 2.0688838958740234, + "eval_runtime": 18.9767, + "eval_samples_per_second": 2606.195, + "eval_steps_per_second": 10.223, + "step": 285000 + }, + { + "epoch": 0.29614320076955813, + "grad_norm": 1.6687910556793213, + "learning_rate": 5e-05, + "loss": 2.088, + "step": 286000 + }, + { + "epoch": 0.29717866650651464, + "grad_norm": 1.308746337890625, + "learning_rate": 5e-05, + "loss": 2.0736, + "step": 287000 + }, + { + "epoch": 0.29821413224347115, + "grad_norm": 1.5213390588760376, + "learning_rate": 5e-05, + "loss": 2.0774, + "step": 288000 + }, + { + "epoch": 0.2992495979804276, + "grad_norm": 1.8728256225585938, + "learning_rate": 5e-05, + "loss": 2.0737, + "step": 289000 + }, + { + "epoch": 0.3002850637173841, + "grad_norm": 1.5906774997711182, + "learning_rate": 5e-05, + "loss": 2.0739, + "step": 290000 + }, + { + "epoch": 0.3002850637173841, + "eval_loss": 2.061387538909912, + "eval_runtime": 34.9804, + "eval_samples_per_second": 1413.847, + "eval_steps_per_second": 5.546, + "step": 290000 + }, + { + "epoch": 0.3013205294543406, + "grad_norm": 1.412349820137024, + "learning_rate": 5e-05, + "loss": 2.0672, + "step": 291000 + }, + { + "epoch": 0.30235599519129713, + "grad_norm": 1.9999415874481201, + "learning_rate": 5e-05, + "loss": 2.0611, + "step": 292000 + }, + { + "epoch": 0.3033914609282536, + "grad_norm": 1.4650294780731201, + "learning_rate": 5e-05, + "loss": 2.0698, + "step": 293000 + }, + { + "epoch": 0.3044269266652101, + "grad_norm": 2.1507210731506348, + "learning_rate": 5e-05, + "loss": 2.0636, + "step": 294000 + }, + { + "epoch": 0.3054623924021666, + "grad_norm": 1.7981712818145752, + "learning_rate": 5e-05, + "loss": 2.0701, + "step": 295000 + }, + { + "epoch": 0.3054623924021666, + "eval_loss": 2.0662500858306885, + "eval_runtime": 20.4231, + "eval_samples_per_second": 2421.625, + "eval_steps_per_second": 9.499, + "step": 295000 + }, + { + "epoch": 0.3064978581391231, + "grad_norm": 1.7765494585037231, + "learning_rate": 5e-05, + "loss": 2.0748, + "step": 296000 + }, + { + "epoch": 0.3075333238760796, + "grad_norm": 1.7243373394012451, + "learning_rate": 5e-05, + "loss": 2.0734, + "step": 297000 + }, + { + "epoch": 0.3085687896130361, + "grad_norm": 1.7474713325500488, + "learning_rate": 5e-05, + "loss": 2.0665, + "step": 298000 + }, + { + "epoch": 0.3096042553499926, + "grad_norm": 1.820787787437439, + "learning_rate": 5e-05, + "loss": 2.069, + "step": 299000 + }, + { + "epoch": 0.3106397210869491, + "grad_norm": 1.5175533294677734, + "learning_rate": 5e-05, + "loss": 2.0667, + "step": 300000 + }, + { + "epoch": 0.3106397210869491, + "eval_loss": 2.058307647705078, + "eval_runtime": 20.3497, + "eval_samples_per_second": 2430.357, + "eval_steps_per_second": 9.533, + "step": 300000 + }, + { + "epoch": 0.3116751868239056, + "grad_norm": 1.475821852684021, + "learning_rate": 5e-05, + "loss": 2.0683, + "step": 301000 + }, + { + "epoch": 0.3127106525608621, + "grad_norm": 1.4600526094436646, + "learning_rate": 5e-05, + "loss": 2.0679, + "step": 302000 + }, + { + "epoch": 0.31374611829781857, + "grad_norm": 1.5122923851013184, + "learning_rate": 5e-05, + "loss": 2.0663, + "step": 303000 + }, + { + "epoch": 0.3147815840347751, + "grad_norm": 1.6594598293304443, + "learning_rate": 5e-05, + "loss": 2.0575, + "step": 304000 + }, + { + "epoch": 0.3158170497717316, + "grad_norm": 1.8523794412612915, + "learning_rate": 5e-05, + "loss": 2.0694, + "step": 305000 + }, + { + "epoch": 0.3158170497717316, + "eval_loss": 2.060145616531372, + "eval_runtime": 18.7888, + "eval_samples_per_second": 2632.265, + "eval_steps_per_second": 10.325, + "step": 305000 + }, + { + "epoch": 0.3168525155086881, + "grad_norm": 1.1590874195098877, + "learning_rate": 5e-05, + "loss": 2.0648, + "step": 306000 + }, + { + "epoch": 0.31788798124564455, + "grad_norm": 1.629807472229004, + "learning_rate": 5e-05, + "loss": 2.0632, + "step": 307000 + }, + { + "epoch": 0.31892344698260106, + "grad_norm": 1.2362703084945679, + "learning_rate": 5e-05, + "loss": 2.052, + "step": 308000 + }, + { + "epoch": 0.31995891271955756, + "grad_norm": 1.9399092197418213, + "learning_rate": 5e-05, + "loss": 2.0559, + "step": 309000 + }, + { + "epoch": 0.3209943784565141, + "grad_norm": 1.528998613357544, + "learning_rate": 5e-05, + "loss": 2.0601, + "step": 310000 + }, + { + "epoch": 0.3209943784565141, + "eval_loss": 2.0462470054626465, + "eval_runtime": 19.5, + "eval_samples_per_second": 2536.255, + "eval_steps_per_second": 9.949, + "step": 310000 + }, + { + "epoch": 0.3220298441934706, + "grad_norm": 1.3977261781692505, + "learning_rate": 5e-05, + "loss": 2.0534, + "step": 311000 + }, + { + "epoch": 0.32306530993042704, + "grad_norm": 1.4338626861572266, + "learning_rate": 5e-05, + "loss": 2.0646, + "step": 312000 + }, + { + "epoch": 0.32410077566738354, + "grad_norm": 1.7435204982757568, + "learning_rate": 5e-05, + "loss": 2.0523, + "step": 313000 + }, + { + "epoch": 0.32513624140434005, + "grad_norm": 1.6941169500350952, + "learning_rate": 5e-05, + "loss": 2.0627, + "step": 314000 + }, + { + "epoch": 0.32617170714129656, + "grad_norm": 1.7250036001205444, + "learning_rate": 5e-05, + "loss": 2.0623, + "step": 315000 + }, + { + "epoch": 0.32617170714129656, + "eval_loss": 2.0483994483947754, + "eval_runtime": 20.1553, + "eval_samples_per_second": 2453.798, + "eval_steps_per_second": 9.625, + "step": 315000 + }, + { + "epoch": 0.32720717287825307, + "grad_norm": 1.5633450746536255, + "learning_rate": 5e-05, + "loss": 2.0504, + "step": 316000 + }, + { + "epoch": 0.3282426386152095, + "grad_norm": 1.9888895750045776, + "learning_rate": 5e-05, + "loss": 2.051, + "step": 317000 + }, + { + "epoch": 0.32927810435216603, + "grad_norm": 1.6756165027618408, + "learning_rate": 5e-05, + "loss": 2.0573, + "step": 318000 + }, + { + "epoch": 0.33031357008912254, + "grad_norm": 1.574082851409912, + "learning_rate": 5e-05, + "loss": 2.0576, + "step": 319000 + }, + { + "epoch": 0.33134903582607905, + "grad_norm": 1.7257170677185059, + "learning_rate": 5e-05, + "loss": 2.0676, + "step": 320000 + }, + { + "epoch": 0.33134903582607905, + "eval_loss": 2.0458145141601562, + "eval_runtime": 20.2597, + "eval_samples_per_second": 2441.148, + "eval_steps_per_second": 9.576, + "step": 320000 + }, + { + "epoch": 0.3323845015630355, + "grad_norm": 1.3943850994110107, + "learning_rate": 5e-05, + "loss": 2.046, + "step": 321000 + }, + { + "epoch": 0.333419967299992, + "grad_norm": 1.759932279586792, + "learning_rate": 5e-05, + "loss": 2.0438, + "step": 322000 + }, + { + "epoch": 0.3344554330369485, + "grad_norm": 1.7004214525222778, + "learning_rate": 5e-05, + "loss": 2.0472, + "step": 323000 + }, + { + "epoch": 0.33549089877390503, + "grad_norm": 1.6331087350845337, + "learning_rate": 5e-05, + "loss": 2.0518, + "step": 324000 + }, + { + "epoch": 0.33652636451086154, + "grad_norm": 1.9464713335037231, + "learning_rate": 5e-05, + "loss": 2.0404, + "step": 325000 + }, + { + "epoch": 0.33652636451086154, + "eval_loss": 2.0383198261260986, + "eval_runtime": 19.6827, + "eval_samples_per_second": 2512.719, + "eval_steps_per_second": 9.856, + "step": 325000 + }, + { + "epoch": 0.337561830247818, + "grad_norm": 1.6935348510742188, + "learning_rate": 5e-05, + "loss": 2.0541, + "step": 326000 + }, + { + "epoch": 0.3385972959847745, + "grad_norm": 1.6641876697540283, + "learning_rate": 5e-05, + "loss": 2.0433, + "step": 327000 + }, + { + "epoch": 0.339632761721731, + "grad_norm": 1.943715214729309, + "learning_rate": 5e-05, + "loss": 2.0469, + "step": 328000 + }, + { + "epoch": 0.3406682274586875, + "grad_norm": 1.791492223739624, + "learning_rate": 5e-05, + "loss": 2.0469, + "step": 329000 + }, + { + "epoch": 0.341703693195644, + "grad_norm": 1.4165565967559814, + "learning_rate": 5e-05, + "loss": 2.0403, + "step": 330000 + }, + { + "epoch": 0.341703693195644, + "eval_loss": 2.038299083709717, + "eval_runtime": 19.6239, + "eval_samples_per_second": 2520.239, + "eval_steps_per_second": 9.886, + "step": 330000 + }, + { + "epoch": 0.3427391589326005, + "grad_norm": 1.9569101333618164, + "learning_rate": 5e-05, + "loss": 2.0615, + "step": 331000 + }, + { + "epoch": 0.343774624669557, + "grad_norm": 1.3618338108062744, + "learning_rate": 5e-05, + "loss": 2.0404, + "step": 332000 + }, + { + "epoch": 0.3448100904065135, + "grad_norm": 1.8879374265670776, + "learning_rate": 5e-05, + "loss": 2.0388, + "step": 333000 + }, + { + "epoch": 0.34584555614347, + "grad_norm": 1.2820725440979004, + "learning_rate": 5e-05, + "loss": 2.0477, + "step": 334000 + }, + { + "epoch": 0.34688102188042647, + "grad_norm": 1.3401552438735962, + "learning_rate": 5e-05, + "loss": 2.0438, + "step": 335000 + }, + { + "epoch": 0.34688102188042647, + "eval_loss": 2.0328009128570557, + "eval_runtime": 26.3779, + "eval_samples_per_second": 1874.94, + "eval_steps_per_second": 7.355, + "step": 335000 + }, + { + "epoch": 0.347916487617383, + "grad_norm": 1.425150752067566, + "learning_rate": 5e-05, + "loss": 2.0465, + "step": 336000 + }, + { + "epoch": 0.3489519533543395, + "grad_norm": 1.5219414234161377, + "learning_rate": 5e-05, + "loss": 2.0342, + "step": 337000 + }, + { + "epoch": 0.349987419091296, + "grad_norm": 1.5171096324920654, + "learning_rate": 5e-05, + "loss": 2.0322, + "step": 338000 + }, + { + "epoch": 0.3510228848282525, + "grad_norm": 1.8049135208129883, + "learning_rate": 5e-05, + "loss": 2.0376, + "step": 339000 + }, + { + "epoch": 0.35205835056520896, + "grad_norm": 1.9653050899505615, + "learning_rate": 5e-05, + "loss": 2.0384, + "step": 340000 + }, + { + "epoch": 0.35205835056520896, + "eval_loss": 2.021156072616577, + "eval_runtime": 20.2299, + "eval_samples_per_second": 2444.746, + "eval_steps_per_second": 9.59, + "step": 340000 + }, + { + "epoch": 0.35309381630216546, + "grad_norm": 1.5175777673721313, + "learning_rate": 5e-05, + "loss": 2.0384, + "step": 341000 + }, + { + "epoch": 0.354129282039122, + "grad_norm": 1.6402604579925537, + "learning_rate": 5e-05, + "loss": 2.0403, + "step": 342000 + }, + { + "epoch": 0.3551647477760785, + "grad_norm": 1.7806463241577148, + "learning_rate": 5e-05, + "loss": 2.0435, + "step": 343000 + }, + { + "epoch": 0.35620021351303494, + "grad_norm": 1.825314998626709, + "learning_rate": 5e-05, + "loss": 2.0337, + "step": 344000 + }, + { + "epoch": 0.35723567924999144, + "grad_norm": 1.6913727521896362, + "learning_rate": 5e-05, + "loss": 2.0374, + "step": 345000 + }, + { + "epoch": 0.35723567924999144, + "eval_loss": 2.018705368041992, + "eval_runtime": 19.9799, + "eval_samples_per_second": 2475.341, + "eval_steps_per_second": 9.71, + "step": 345000 + }, + { + "epoch": 0.35827114498694795, + "grad_norm": 1.3637776374816895, + "learning_rate": 5e-05, + "loss": 2.0404, + "step": 346000 + }, + { + "epoch": 0.35930661072390446, + "grad_norm": 1.955057978630066, + "learning_rate": 5e-05, + "loss": 2.0286, + "step": 347000 + }, + { + "epoch": 0.36034207646086097, + "grad_norm": 1.516166090965271, + "learning_rate": 5e-05, + "loss": 2.0311, + "step": 348000 + }, + { + "epoch": 0.3613775421978174, + "grad_norm": 2.4298129081726074, + "learning_rate": 5e-05, + "loss": 2.0348, + "step": 349000 + }, + { + "epoch": 0.36241300793477393, + "grad_norm": 1.5734537839889526, + "learning_rate": 5e-05, + "loss": 2.0333, + "step": 350000 + }, + { + "epoch": 0.36241300793477393, + "eval_loss": 2.019728422164917, + "eval_runtime": 19.4884, + "eval_samples_per_second": 2537.767, + "eval_steps_per_second": 9.955, + "step": 350000 + }, + { + "epoch": 0.36344847367173044, + "grad_norm": 1.2852146625518799, + "learning_rate": 5e-05, + "loss": 2.0381, + "step": 351000 + }, + { + "epoch": 0.36448393940868695, + "grad_norm": 1.62471342086792, + "learning_rate": 5e-05, + "loss": 2.0375, + "step": 352000 + }, + { + "epoch": 0.3655194051456434, + "grad_norm": 1.6946029663085938, + "learning_rate": 5e-05, + "loss": 2.0291, + "step": 353000 + }, + { + "epoch": 0.3665548708825999, + "grad_norm": 1.8819012641906738, + "learning_rate": 5e-05, + "loss": 2.0251, + "step": 354000 + }, + { + "epoch": 0.3675903366195564, + "grad_norm": 1.8506168127059937, + "learning_rate": 5e-05, + "loss": 2.0297, + "step": 355000 + }, + { + "epoch": 0.3675903366195564, + "eval_loss": 2.0166077613830566, + "eval_runtime": 20.1725, + "eval_samples_per_second": 2451.7, + "eval_steps_per_second": 9.617, + "step": 355000 + }, + { + "epoch": 0.36862580235651293, + "grad_norm": 1.6591482162475586, + "learning_rate": 5e-05, + "loss": 2.0279, + "step": 356000 + }, + { + "epoch": 0.36966126809346944, + "grad_norm": 1.9256037473678589, + "learning_rate": 5e-05, + "loss": 2.0314, + "step": 357000 + }, + { + "epoch": 0.3706967338304259, + "grad_norm": 1.5882221460342407, + "learning_rate": 5e-05, + "loss": 2.0365, + "step": 358000 + }, + { + "epoch": 0.3717321995673824, + "grad_norm": 1.8153852224349976, + "learning_rate": 5e-05, + "loss": 2.0201, + "step": 359000 + }, + { + "epoch": 0.3727676653043389, + "grad_norm": 1.7015492916107178, + "learning_rate": 5e-05, + "loss": 2.027, + "step": 360000 + }, + { + "epoch": 0.3727676653043389, + "eval_loss": 2.019824504852295, + "eval_runtime": 20.5579, + "eval_samples_per_second": 2405.737, + "eval_steps_per_second": 9.437, + "step": 360000 + }, + { + "epoch": 0.3738031310412954, + "grad_norm": 1.973598837852478, + "learning_rate": 5e-05, + "loss": 2.0345, + "step": 361000 + }, + { + "epoch": 0.37483859677825193, + "grad_norm": 1.685168981552124, + "learning_rate": 5e-05, + "loss": 2.0336, + "step": 362000 + }, + { + "epoch": 0.3758740625152084, + "grad_norm": 1.4376262426376343, + "learning_rate": 5e-05, + "loss": 2.017, + "step": 363000 + }, + { + "epoch": 0.3769095282521649, + "grad_norm": 1.309055209159851, + "learning_rate": 5e-05, + "loss": 2.0348, + "step": 364000 + }, + { + "epoch": 0.3779449939891214, + "grad_norm": 1.5164344310760498, + "learning_rate": 5e-05, + "loss": 2.034, + "step": 365000 + }, + { + "epoch": 0.3779449939891214, + "eval_loss": 2.0100014209747314, + "eval_runtime": 20.1879, + "eval_samples_per_second": 2449.835, + "eval_steps_per_second": 9.61, + "step": 365000 + }, + { + "epoch": 0.3789804597260779, + "grad_norm": 1.476970911026001, + "learning_rate": 5e-05, + "loss": 2.0214, + "step": 366000 + }, + { + "epoch": 0.38001592546303437, + "grad_norm": 1.4336302280426025, + "learning_rate": 5e-05, + "loss": 2.0237, + "step": 367000 + }, + { + "epoch": 0.3810513911999909, + "grad_norm": 1.7313731908798218, + "learning_rate": 5e-05, + "loss": 2.0215, + "step": 368000 + }, + { + "epoch": 0.3820868569369474, + "grad_norm": 2.014364719390869, + "learning_rate": 5e-05, + "loss": 2.0105, + "step": 369000 + }, + { + "epoch": 0.3831223226739039, + "grad_norm": 1.2657063007354736, + "learning_rate": 5e-05, + "loss": 2.03, + "step": 370000 + }, + { + "epoch": 0.3831223226739039, + "eval_loss": 2.0104730129241943, + "eval_runtime": 19.5262, + "eval_samples_per_second": 2532.856, + "eval_steps_per_second": 9.935, + "step": 370000 + }, + { + "epoch": 0.3841577884108604, + "grad_norm": 2.069312334060669, + "learning_rate": 5e-05, + "loss": 2.0307, + "step": 371000 + }, + { + "epoch": 0.38519325414781685, + "grad_norm": 1.383955955505371, + "learning_rate": 5e-05, + "loss": 2.0294, + "step": 372000 + }, + { + "epoch": 0.38622871988477336, + "grad_norm": 1.489496111869812, + "learning_rate": 5e-05, + "loss": 2.0232, + "step": 373000 + }, + { + "epoch": 0.3872641856217299, + "grad_norm": 1.5222963094711304, + "learning_rate": 5e-05, + "loss": 2.0066, + "step": 374000 + }, + { + "epoch": 0.3882996513586864, + "grad_norm": 1.6142817735671997, + "learning_rate": 5e-05, + "loss": 2.006, + "step": 375000 + }, + { + "epoch": 0.3882996513586864, + "eval_loss": 2.0057575702667236, + "eval_runtime": 20.1185, + "eval_samples_per_second": 2458.285, + "eval_steps_per_second": 9.643, + "step": 375000 + }, + { + "epoch": 0.3893351170956429, + "grad_norm": 1.6987862586975098, + "learning_rate": 5e-05, + "loss": 2.0187, + "step": 376000 + }, + { + "epoch": 0.39037058283259934, + "grad_norm": 1.5701992511749268, + "learning_rate": 5e-05, + "loss": 2.0086, + "step": 377000 + }, + { + "epoch": 0.39140604856955585, + "grad_norm": 1.737766146659851, + "learning_rate": 5e-05, + "loss": 2.0136, + "step": 378000 + }, + { + "epoch": 0.39244151430651236, + "grad_norm": 1.5094623565673828, + "learning_rate": 5e-05, + "loss": 2.0112, + "step": 379000 + }, + { + "epoch": 0.39347698004346887, + "grad_norm": 1.720401644706726, + "learning_rate": 5e-05, + "loss": 2.0301, + "step": 380000 + }, + { + "epoch": 0.39347698004346887, + "eval_loss": 2.005438804626465, + "eval_runtime": 21.5272, + "eval_samples_per_second": 2297.424, + "eval_steps_per_second": 9.012, + "step": 380000 + }, + { + "epoch": 0.3945124457804253, + "grad_norm": 1.6071189641952515, + "learning_rate": 5e-05, + "loss": 2.0186, + "step": 381000 + }, + { + "epoch": 0.39554791151738183, + "grad_norm": 1.3733688592910767, + "learning_rate": 5e-05, + "loss": 2.0177, + "step": 382000 + }, + { + "epoch": 0.39658337725433834, + "grad_norm": 1.7012932300567627, + "learning_rate": 5e-05, + "loss": 2.0125, + "step": 383000 + }, + { + "epoch": 0.39761884299129485, + "grad_norm": 1.656907320022583, + "learning_rate": 5e-05, + "loss": 2.0204, + "step": 384000 + }, + { + "epoch": 0.39865430872825136, + "grad_norm": 1.859150767326355, + "learning_rate": 5e-05, + "loss": 2.0105, + "step": 385000 + }, + { + "epoch": 0.39865430872825136, + "eval_loss": 2.0000715255737305, + "eval_runtime": 20.4022, + "eval_samples_per_second": 2424.105, + "eval_steps_per_second": 9.509, + "step": 385000 + }, + { + "epoch": 0.3996897744652078, + "grad_norm": 1.4494318962097168, + "learning_rate": 5e-05, + "loss": 2.0109, + "step": 386000 + }, + { + "epoch": 0.4007252402021643, + "grad_norm": 1.7626131772994995, + "learning_rate": 5e-05, + "loss": 2.0249, + "step": 387000 + }, + { + "epoch": 0.40176070593912083, + "grad_norm": 1.2615996599197388, + "learning_rate": 5e-05, + "loss": 2.0044, + "step": 388000 + }, + { + "epoch": 0.40279617167607734, + "grad_norm": 2.000866174697876, + "learning_rate": 5e-05, + "loss": 1.9996, + "step": 389000 + }, + { + "epoch": 0.4038316374130338, + "grad_norm": 1.5822980403900146, + "learning_rate": 5e-05, + "loss": 2.0113, + "step": 390000 + }, + { + "epoch": 0.4038316374130338, + "eval_loss": 2.0007712841033936, + "eval_runtime": 19.8198, + "eval_samples_per_second": 2495.337, + "eval_steps_per_second": 9.788, + "step": 390000 + }, + { + "epoch": 0.4048671031499903, + "grad_norm": 1.8089967966079712, + "learning_rate": 5e-05, + "loss": 2.0017, + "step": 391000 + }, + { + "epoch": 0.4059025688869468, + "grad_norm": 2.0939323902130127, + "learning_rate": 5e-05, + "loss": 2.017, + "step": 392000 + }, + { + "epoch": 0.4069380346239033, + "grad_norm": 1.581272840499878, + "learning_rate": 5e-05, + "loss": 2.0061, + "step": 393000 + }, + { + "epoch": 0.40797350036085983, + "grad_norm": 1.3183523416519165, + "learning_rate": 5e-05, + "loss": 2.0044, + "step": 394000 + }, + { + "epoch": 0.4090089660978163, + "grad_norm": 1.9399663209915161, + "learning_rate": 5e-05, + "loss": 2.0, + "step": 395000 + }, + { + "epoch": 0.4090089660978163, + "eval_loss": 1.9924626350402832, + "eval_runtime": 20.3614, + "eval_samples_per_second": 2428.953, + "eval_steps_per_second": 9.528, + "step": 395000 + }, + { + "epoch": 0.4100444318347728, + "grad_norm": 1.667758584022522, + "learning_rate": 5e-05, + "loss": 2.0093, + "step": 396000 + }, + { + "epoch": 0.4110798975717293, + "grad_norm": 2.133141040802002, + "learning_rate": 5e-05, + "loss": 2.0159, + "step": 397000 + }, + { + "epoch": 0.4121153633086858, + "grad_norm": 1.440760612487793, + "learning_rate": 5e-05, + "loss": 2.0047, + "step": 398000 + }, + { + "epoch": 0.4131508290456423, + "grad_norm": 1.7397841215133667, + "learning_rate": 5e-05, + "loss": 1.9918, + "step": 399000 + }, + { + "epoch": 0.4141862947825988, + "grad_norm": 1.8425333499908447, + "learning_rate": 5e-05, + "loss": 2.0072, + "step": 400000 + }, + { + "epoch": 0.4141862947825988, + "eval_loss": 1.9991180896759033, + "eval_runtime": 20.8264, + "eval_samples_per_second": 2374.73, + "eval_steps_per_second": 9.315, + "step": 400000 + }, + { + "epoch": 0.4152217605195553, + "grad_norm": 1.6759531497955322, + "learning_rate": 5e-05, + "loss": 2.0059, + "step": 401000 + }, + { + "epoch": 0.4162572262565118, + "grad_norm": 1.6831870079040527, + "learning_rate": 5e-05, + "loss": 1.9958, + "step": 402000 + }, + { + "epoch": 0.4172926919934683, + "grad_norm": 1.5509451627731323, + "learning_rate": 5e-05, + "loss": 2.0137, + "step": 403000 + }, + { + "epoch": 0.41832815773042475, + "grad_norm": 1.3839584589004517, + "learning_rate": 5e-05, + "loss": 1.9961, + "step": 404000 + }, + { + "epoch": 0.41936362346738126, + "grad_norm": 1.8058265447616577, + "learning_rate": 5e-05, + "loss": 1.9999, + "step": 405000 + }, + { + "epoch": 0.41936362346738126, + "eval_loss": 1.9969453811645508, + "eval_runtime": 20.4837, + "eval_samples_per_second": 2414.454, + "eval_steps_per_second": 9.471, + "step": 405000 + }, + { + "epoch": 0.4203990892043378, + "grad_norm": 2.366974353790283, + "learning_rate": 5e-05, + "loss": 2.0044, + "step": 406000 + }, + { + "epoch": 0.4214345549412943, + "grad_norm": 1.7304997444152832, + "learning_rate": 5e-05, + "loss": 2.0082, + "step": 407000 + }, + { + "epoch": 0.4224700206782508, + "grad_norm": 1.6823433637619019, + "learning_rate": 5e-05, + "loss": 2.015, + "step": 408000 + }, + { + "epoch": 0.42350548641520724, + "grad_norm": 1.8099905252456665, + "learning_rate": 5e-05, + "loss": 2.0005, + "step": 409000 + }, + { + "epoch": 0.42454095215216375, + "grad_norm": 1.789655089378357, + "learning_rate": 5e-05, + "loss": 1.986, + "step": 410000 + }, + { + "epoch": 0.42454095215216375, + "eval_loss": 1.9884740114212036, + "eval_runtime": 20.7019, + "eval_samples_per_second": 2389.009, + "eval_steps_per_second": 9.371, + "step": 410000 + }, + { + "epoch": 0.42557641788912026, + "grad_norm": 1.3981925249099731, + "learning_rate": 5e-05, + "loss": 2.0071, + "step": 411000 + }, + { + "epoch": 0.42661188362607677, + "grad_norm": 1.5761098861694336, + "learning_rate": 5e-05, + "loss": 1.9996, + "step": 412000 + }, + { + "epoch": 0.4276473493630332, + "grad_norm": 1.5765936374664307, + "learning_rate": 5e-05, + "loss": 1.9979, + "step": 413000 + }, + { + "epoch": 0.42868281509998973, + "grad_norm": 1.5242362022399902, + "learning_rate": 5e-05, + "loss": 1.9872, + "step": 414000 + }, + { + "epoch": 0.42971828083694624, + "grad_norm": 1.7076730728149414, + "learning_rate": 5e-05, + "loss": 2.003, + "step": 415000 + }, + { + "epoch": 0.42971828083694624, + "eval_loss": 1.9865856170654297, + "eval_runtime": 19.7883, + "eval_samples_per_second": 2499.301, + "eval_steps_per_second": 9.804, + "step": 415000 + }, + { + "epoch": 0.43075374657390275, + "grad_norm": 1.5453383922576904, + "learning_rate": 5e-05, + "loss": 2.0, + "step": 416000 + }, + { + "epoch": 0.43178921231085926, + "grad_norm": 1.9326260089874268, + "learning_rate": 5e-05, + "loss": 2.0007, + "step": 417000 + }, + { + "epoch": 0.4328246780478157, + "grad_norm": 1.6275213956832886, + "learning_rate": 5e-05, + "loss": 1.99, + "step": 418000 + }, + { + "epoch": 0.4338601437847722, + "grad_norm": 1.4359986782073975, + "learning_rate": 5e-05, + "loss": 1.9895, + "step": 419000 + }, + { + "epoch": 0.43489560952172873, + "grad_norm": 1.7742505073547363, + "learning_rate": 5e-05, + "loss": 2.0013, + "step": 420000 + }, + { + "epoch": 0.43489560952172873, + "eval_loss": 1.9821418523788452, + "eval_runtime": 20.3362, + "eval_samples_per_second": 2431.974, + "eval_steps_per_second": 9.54, + "step": 420000 + }, + { + "epoch": 0.43593107525868524, + "grad_norm": 1.7934837341308594, + "learning_rate": 5e-05, + "loss": 1.9856, + "step": 421000 + }, + { + "epoch": 0.43696654099564175, + "grad_norm": 1.4223476648330688, + "learning_rate": 5e-05, + "loss": 1.9922, + "step": 422000 + }, + { + "epoch": 0.4380020067325982, + "grad_norm": 1.4957746267318726, + "learning_rate": 5e-05, + "loss": 1.9842, + "step": 423000 + }, + { + "epoch": 0.4390374724695547, + "grad_norm": 1.7151662111282349, + "learning_rate": 5e-05, + "loss": 1.9929, + "step": 424000 + }, + { + "epoch": 0.4400729382065112, + "grad_norm": 1.6285368204116821, + "learning_rate": 5e-05, + "loss": 1.9955, + "step": 425000 + }, + { + "epoch": 0.4400729382065112, + "eval_loss": 1.9809545278549194, + "eval_runtime": 20.6817, + "eval_samples_per_second": 2391.337, + "eval_steps_per_second": 9.38, + "step": 425000 + }, + { + "epoch": 0.44110840394346773, + "grad_norm": 1.7479044198989868, + "learning_rate": 5e-05, + "loss": 1.9876, + "step": 426000 + }, + { + "epoch": 0.4421438696804242, + "grad_norm": 1.6902720928192139, + "learning_rate": 5e-05, + "loss": 1.9939, + "step": 427000 + }, + { + "epoch": 0.4431793354173807, + "grad_norm": 1.5723377466201782, + "learning_rate": 5e-05, + "loss": 1.9845, + "step": 428000 + }, + { + "epoch": 0.4442148011543372, + "grad_norm": 1.6776286363601685, + "learning_rate": 5e-05, + "loss": 1.9852, + "step": 429000 + }, + { + "epoch": 0.4452502668912937, + "grad_norm": 2.08964204788208, + "learning_rate": 5e-05, + "loss": 1.997, + "step": 430000 + }, + { + "epoch": 0.4452502668912937, + "eval_loss": 1.9794005155563354, + "eval_runtime": 21.8926, + "eval_samples_per_second": 2259.078, + "eval_steps_per_second": 8.861, + "step": 430000 + }, + { + "epoch": 0.4462857326282502, + "grad_norm": 1.450403094291687, + "learning_rate": 5e-05, + "loss": 1.9929, + "step": 431000 + }, + { + "epoch": 0.4473211983652067, + "grad_norm": 1.9465093612670898, + "learning_rate": 5e-05, + "loss": 1.982, + "step": 432000 + }, + { + "epoch": 0.4483566641021632, + "grad_norm": 1.9567445516586304, + "learning_rate": 5e-05, + "loss": 1.9865, + "step": 433000 + }, + { + "epoch": 0.4493921298391197, + "grad_norm": 1.8487781286239624, + "learning_rate": 5e-05, + "loss": 1.9861, + "step": 434000 + }, + { + "epoch": 0.4504275955760762, + "grad_norm": 1.77712082862854, + "learning_rate": 5e-05, + "loss": 1.9831, + "step": 435000 + }, + { + "epoch": 0.4504275955760762, + "eval_loss": 1.9783129692077637, + "eval_runtime": 20.7598, + "eval_samples_per_second": 2382.35, + "eval_steps_per_second": 9.345, + "step": 435000 + }, + { + "epoch": 0.4514630613130327, + "grad_norm": 1.4690871238708496, + "learning_rate": 5e-05, + "loss": 1.984, + "step": 436000 + }, + { + "epoch": 0.45249852704998916, + "grad_norm": 1.974969506263733, + "learning_rate": 5e-05, + "loss": 1.9857, + "step": 437000 + }, + { + "epoch": 0.45353399278694567, + "grad_norm": 2.069812774658203, + "learning_rate": 5e-05, + "loss": 1.9851, + "step": 438000 + }, + { + "epoch": 0.4545694585239022, + "grad_norm": 1.603868842124939, + "learning_rate": 5e-05, + "loss": 1.9768, + "step": 439000 + }, + { + "epoch": 0.4556049242608587, + "grad_norm": 1.8147460222244263, + "learning_rate": 5e-05, + "loss": 1.9948, + "step": 440000 + }, + { + "epoch": 0.4556049242608587, + "eval_loss": 1.9627678394317627, + "eval_runtime": 20.2676, + "eval_samples_per_second": 2440.206, + "eval_steps_per_second": 9.572, + "step": 440000 + }, + { + "epoch": 0.45664038999781514, + "grad_norm": 1.8408571481704712, + "learning_rate": 5e-05, + "loss": 1.9941, + "step": 441000 + }, + { + "epoch": 0.45767585573477165, + "grad_norm": 1.523547649383545, + "learning_rate": 5e-05, + "loss": 1.9889, + "step": 442000 + }, + { + "epoch": 0.45871132147172816, + "grad_norm": 1.6914969682693481, + "learning_rate": 5e-05, + "loss": 1.9828, + "step": 443000 + }, + { + "epoch": 0.45974678720868467, + "grad_norm": 1.5032548904418945, + "learning_rate": 5e-05, + "loss": 1.9863, + "step": 444000 + }, + { + "epoch": 0.4607822529456412, + "grad_norm": 1.5079525709152222, + "learning_rate": 5e-05, + "loss": 1.9828, + "step": 445000 + }, + { + "epoch": 0.4607822529456412, + "eval_loss": 1.9710944890975952, + "eval_runtime": 20.9498, + "eval_samples_per_second": 2360.739, + "eval_steps_per_second": 9.26, + "step": 445000 + }, + { + "epoch": 0.46181771868259763, + "grad_norm": 1.5467554330825806, + "learning_rate": 5e-05, + "loss": 1.9786, + "step": 446000 + }, + { + "epoch": 0.46285318441955414, + "grad_norm": 1.4302737712860107, + "learning_rate": 5e-05, + "loss": 1.988, + "step": 447000 + }, + { + "epoch": 0.46388865015651065, + "grad_norm": 1.6843082904815674, + "learning_rate": 5e-05, + "loss": 1.9729, + "step": 448000 + }, + { + "epoch": 0.46492411589346716, + "grad_norm": 1.6686064004898071, + "learning_rate": 5e-05, + "loss": 1.9856, + "step": 449000 + }, + { + "epoch": 0.4659595816304236, + "grad_norm": 1.7648873329162598, + "learning_rate": 5e-05, + "loss": 1.977, + "step": 450000 + }, + { + "epoch": 0.4659595816304236, + "eval_loss": 1.9715555906295776, + "eval_runtime": 20.5454, + "eval_samples_per_second": 2407.206, + "eval_steps_per_second": 9.443, + "step": 450000 + }, + { + "epoch": 0.4669950473673801, + "grad_norm": 1.4987176656723022, + "learning_rate": 5e-05, + "loss": 1.9777, + "step": 451000 + }, + { + "epoch": 0.46803051310433663, + "grad_norm": 1.6089203357696533, + "learning_rate": 5e-05, + "loss": 1.9803, + "step": 452000 + }, + { + "epoch": 0.46906597884129314, + "grad_norm": 1.430272102355957, + "learning_rate": 5e-05, + "loss": 1.9824, + "step": 453000 + }, + { + "epoch": 0.47010144457824965, + "grad_norm": 1.4526315927505493, + "learning_rate": 5e-05, + "loss": 1.9811, + "step": 454000 + }, + { + "epoch": 0.4711369103152061, + "grad_norm": 1.6255720853805542, + "learning_rate": 5e-05, + "loss": 1.9749, + "step": 455000 + }, + { + "epoch": 0.4711369103152061, + "eval_loss": 1.972861409187317, + "eval_runtime": 21.2643, + "eval_samples_per_second": 2325.821, + "eval_steps_per_second": 9.123, + "step": 455000 + }, + { + "epoch": 0.4721723760521626, + "grad_norm": 1.470329761505127, + "learning_rate": 5e-05, + "loss": 1.9825, + "step": 456000 + }, + { + "epoch": 0.4732078417891191, + "grad_norm": 1.5077927112579346, + "learning_rate": 5e-05, + "loss": 1.9826, + "step": 457000 + }, + { + "epoch": 0.47424330752607563, + "grad_norm": 1.5065680742263794, + "learning_rate": 5e-05, + "loss": 1.9794, + "step": 458000 + }, + { + "epoch": 0.47527877326303214, + "grad_norm": 1.6117650270462036, + "learning_rate": 5e-05, + "loss": 1.9769, + "step": 459000 + }, + { + "epoch": 0.4763142389999886, + "grad_norm": 1.5135180950164795, + "learning_rate": 5e-05, + "loss": 1.9709, + "step": 460000 + }, + { + "epoch": 0.4763142389999886, + "eval_loss": 1.9691742658615112, + "eval_runtime": 21.5566, + "eval_samples_per_second": 2294.281, + "eval_steps_per_second": 9.0, + "step": 460000 + }, + { + "epoch": 0.4773497047369451, + "grad_norm": 1.9140368700027466, + "learning_rate": 5e-05, + "loss": 1.9724, + "step": 461000 + }, + { + "epoch": 0.4783851704739016, + "grad_norm": 1.59769606590271, + "learning_rate": 5e-05, + "loss": 1.9773, + "step": 462000 + }, + { + "epoch": 0.4794206362108581, + "grad_norm": 1.7482682466506958, + "learning_rate": 5e-05, + "loss": 1.9668, + "step": 463000 + }, + { + "epoch": 0.4804561019478146, + "grad_norm": 1.3569929599761963, + "learning_rate": 5e-05, + "loss": 1.9759, + "step": 464000 + }, + { + "epoch": 0.4814915676847711, + "grad_norm": 1.793421745300293, + "learning_rate": 5e-05, + "loss": 1.9803, + "step": 465000 + }, + { + "epoch": 0.4814915676847711, + "eval_loss": 1.965020775794983, + "eval_runtime": 20.5866, + "eval_samples_per_second": 2402.394, + "eval_steps_per_second": 9.424, + "step": 465000 + }, + { + "epoch": 0.4825270334217276, + "grad_norm": 1.6482970714569092, + "learning_rate": 5e-05, + "loss": 1.9751, + "step": 466000 + }, + { + "epoch": 0.4835624991586841, + "grad_norm": 1.9341484308242798, + "learning_rate": 5e-05, + "loss": 1.9804, + "step": 467000 + }, + { + "epoch": 0.4845979648956406, + "grad_norm": 1.7689942121505737, + "learning_rate": 5e-05, + "loss": 1.9709, + "step": 468000 + }, + { + "epoch": 0.48563343063259706, + "grad_norm": 1.5617495775222778, + "learning_rate": 5e-05, + "loss": 1.9817, + "step": 469000 + }, + { + "epoch": 0.48666889636955357, + "grad_norm": 1.3667575120925903, + "learning_rate": 5e-05, + "loss": 1.9678, + "step": 470000 + }, + { + "epoch": 0.48666889636955357, + "eval_loss": 1.9600496292114258, + "eval_runtime": 20.8747, + "eval_samples_per_second": 2369.229, + "eval_steps_per_second": 9.294, + "step": 470000 + }, + { + "epoch": 0.4877043621065101, + "grad_norm": 1.5237644910812378, + "learning_rate": 5e-05, + "loss": 1.9686, + "step": 471000 + }, + { + "epoch": 0.4887398278434666, + "grad_norm": 1.3905088901519775, + "learning_rate": 5e-05, + "loss": 1.9668, + "step": 472000 + }, + { + "epoch": 0.48977529358042304, + "grad_norm": 1.2724034786224365, + "learning_rate": 5e-05, + "loss": 1.986, + "step": 473000 + }, + { + "epoch": 0.49081075931737955, + "grad_norm": 1.9573471546173096, + "learning_rate": 5e-05, + "loss": 1.9712, + "step": 474000 + }, + { + "epoch": 0.49184622505433606, + "grad_norm": 1.5011438131332397, + "learning_rate": 5e-05, + "loss": 1.972, + "step": 475000 + }, + { + "epoch": 0.49184622505433606, + "eval_loss": 1.9591975212097168, + "eval_runtime": 20.7331, + "eval_samples_per_second": 2385.41, + "eval_steps_per_second": 9.357, + "step": 475000 + }, + { + "epoch": 0.49288169079129257, + "grad_norm": 1.5569186210632324, + "learning_rate": 5e-05, + "loss": 1.9717, + "step": 476000 + }, + { + "epoch": 0.4939171565282491, + "grad_norm": 1.3786282539367676, + "learning_rate": 5e-05, + "loss": 1.9665, + "step": 477000 + }, + { + "epoch": 0.49495262226520553, + "grad_norm": 1.557861328125, + "learning_rate": 5e-05, + "loss": 1.9757, + "step": 478000 + }, + { + "epoch": 0.49598808800216204, + "grad_norm": 1.5497270822525024, + "learning_rate": 5e-05, + "loss": 1.9756, + "step": 479000 + }, + { + "epoch": 0.49702355373911855, + "grad_norm": 1.4066565036773682, + "learning_rate": 5e-05, + "loss": 1.9674, + "step": 480000 + }, + { + "epoch": 0.49702355373911855, + "eval_loss": 1.9571633338928223, + "eval_runtime": 20.9157, + "eval_samples_per_second": 2364.586, + "eval_steps_per_second": 9.275, + "step": 480000 + }, + { + "epoch": 0.49805901947607506, + "grad_norm": 1.862382173538208, + "learning_rate": 5e-05, + "loss": 1.9645, + "step": 481000 + }, + { + "epoch": 0.49909448521303157, + "grad_norm": 1.7556581497192383, + "learning_rate": 5e-05, + "loss": 1.9779, + "step": 482000 + }, + { + "epoch": 0.500129950949988, + "grad_norm": 1.5234323740005493, + "learning_rate": 5e-05, + "loss": 1.9611, + "step": 483000 + }, + { + "epoch": 0.5011654166869446, + "grad_norm": 1.395575761795044, + "learning_rate": 5e-05, + "loss": 1.9707, + "step": 484000 + }, + { + "epoch": 0.502200882423901, + "grad_norm": 1.82569420337677, + "learning_rate": 5e-05, + "loss": 1.9659, + "step": 485000 + }, + { + "epoch": 0.502200882423901, + "eval_loss": 1.960773229598999, + "eval_runtime": 20.3475, + "eval_samples_per_second": 2430.618, + "eval_steps_per_second": 9.534, + "step": 485000 + }, + { + "epoch": 0.5032363481608575, + "grad_norm": 1.3626302480697632, + "learning_rate": 5e-05, + "loss": 1.9606, + "step": 486000 + }, + { + "epoch": 0.5042718138978141, + "grad_norm": 1.4180521965026855, + "learning_rate": 5e-05, + "loss": 1.9689, + "step": 487000 + }, + { + "epoch": 0.5053072796347705, + "grad_norm": 1.9080036878585815, + "learning_rate": 5e-05, + "loss": 1.9618, + "step": 488000 + }, + { + "epoch": 0.5063427453717271, + "grad_norm": 1.3812907934188843, + "learning_rate": 5e-05, + "loss": 1.9663, + "step": 489000 + }, + { + "epoch": 0.5073782111086835, + "grad_norm": 1.6814547777175903, + "learning_rate": 5e-05, + "loss": 1.9612, + "step": 490000 + }, + { + "epoch": 0.5073782111086835, + "eval_loss": 1.9534902572631836, + "eval_runtime": 21.4593, + "eval_samples_per_second": 2304.692, + "eval_steps_per_second": 9.04, + "step": 490000 + }, + { + "epoch": 0.50841367684564, + "grad_norm": 1.7079062461853027, + "learning_rate": 5e-05, + "loss": 1.9644, + "step": 491000 + }, + { + "epoch": 0.5094491425825965, + "grad_norm": 1.8109638690948486, + "learning_rate": 5e-05, + "loss": 1.9602, + "step": 492000 + }, + { + "epoch": 0.510484608319553, + "grad_norm": 1.6124908924102783, + "learning_rate": 5e-05, + "loss": 1.9582, + "step": 493000 + }, + { + "epoch": 0.5115200740565095, + "grad_norm": 1.6076602935791016, + "learning_rate": 5e-05, + "loss": 1.9548, + "step": 494000 + }, + { + "epoch": 0.512555539793466, + "grad_norm": 1.3506944179534912, + "learning_rate": 5e-05, + "loss": 1.9571, + "step": 495000 + }, + { + "epoch": 0.512555539793466, + "eval_loss": 1.9505630731582642, + "eval_runtime": 20.7858, + "eval_samples_per_second": 2379.366, + "eval_steps_per_second": 9.333, + "step": 495000 + }, + { + "epoch": 0.5135910055304225, + "grad_norm": 1.4965591430664062, + "learning_rate": 5e-05, + "loss": 1.9625, + "step": 496000 + }, + { + "epoch": 0.514626471267379, + "grad_norm": 3.6810128688812256, + "learning_rate": 5e-05, + "loss": 1.9616, + "step": 497000 + }, + { + "epoch": 0.5156619370043355, + "grad_norm": 1.7980040311813354, + "learning_rate": 5e-05, + "loss": 1.9685, + "step": 498000 + }, + { + "epoch": 0.516697402741292, + "grad_norm": 2.078582286834717, + "learning_rate": 5e-05, + "loss": 1.9583, + "step": 499000 + }, + { + "epoch": 0.5177328684782485, + "grad_norm": 1.7889291048049927, + "learning_rate": 5e-05, + "loss": 1.9468, + "step": 500000 + }, + { + "epoch": 0.5177328684782485, + "eval_loss": 1.9439510107040405, + "eval_runtime": 20.9843, + "eval_samples_per_second": 2356.859, + "eval_steps_per_second": 9.245, + "step": 500000 + }, + { + "epoch": 0.518768334215205, + "grad_norm": 1.7821800708770752, + "learning_rate": 5e-05, + "loss": 1.9471, + "step": 501000 + }, + { + "epoch": 0.5198037999521615, + "grad_norm": 1.6444828510284424, + "learning_rate": 5e-05, + "loss": 1.957, + "step": 502000 + }, + { + "epoch": 0.520839265689118, + "grad_norm": 1.563770055770874, + "learning_rate": 5e-05, + "loss": 1.96, + "step": 503000 + }, + { + "epoch": 0.5218747314260744, + "grad_norm": 1.7522222995758057, + "learning_rate": 5e-05, + "loss": 1.9566, + "step": 504000 + }, + { + "epoch": 0.522910197163031, + "grad_norm": 1.9874364137649536, + "learning_rate": 5e-05, + "loss": 1.95, + "step": 505000 + }, + { + "epoch": 0.522910197163031, + "eval_loss": 1.9490445852279663, + "eval_runtime": 20.4006, + "eval_samples_per_second": 2424.286, + "eval_steps_per_second": 9.51, + "step": 505000 + }, + { + "epoch": 0.5239456628999875, + "grad_norm": 1.693300724029541, + "learning_rate": 5e-05, + "loss": 1.9627, + "step": 506000 + }, + { + "epoch": 0.524981128636944, + "grad_norm": 1.4723318815231323, + "learning_rate": 5e-05, + "loss": 1.9523, + "step": 507000 + }, + { + "epoch": 0.5260165943739005, + "grad_norm": 1.6969449520111084, + "learning_rate": 5e-05, + "loss": 1.9583, + "step": 508000 + }, + { + "epoch": 0.5270520601108569, + "grad_norm": 2.029672861099243, + "learning_rate": 5e-05, + "loss": 1.9561, + "step": 509000 + }, + { + "epoch": 0.5280875258478135, + "grad_norm": 1.733914852142334, + "learning_rate": 5e-05, + "loss": 1.9458, + "step": 510000 + }, + { + "epoch": 0.5280875258478135, + "eval_loss": 1.947721004486084, + "eval_runtime": 20.9868, + "eval_samples_per_second": 2356.582, + "eval_steps_per_second": 9.244, + "step": 510000 + }, + { + "epoch": 0.5291229915847699, + "grad_norm": 1.4405544996261597, + "learning_rate": 5e-05, + "loss": 1.9605, + "step": 511000 + }, + { + "epoch": 0.5301584573217265, + "grad_norm": 1.559751272201538, + "learning_rate": 5e-05, + "loss": 1.9635, + "step": 512000 + }, + { + "epoch": 0.531193923058683, + "grad_norm": 1.570190191268921, + "learning_rate": 5e-05, + "loss": 1.9599, + "step": 513000 + }, + { + "epoch": 0.5322293887956394, + "grad_norm": 1.7898485660552979, + "learning_rate": 5e-05, + "loss": 1.9665, + "step": 514000 + }, + { + "epoch": 0.533264854532596, + "grad_norm": 1.6552945375442505, + "learning_rate": 5e-05, + "loss": 1.9477, + "step": 515000 + }, + { + "epoch": 0.533264854532596, + "eval_loss": 1.9498939514160156, + "eval_runtime": 20.4739, + "eval_samples_per_second": 2415.616, + "eval_steps_per_second": 9.475, + "step": 515000 + }, + { + "epoch": 0.5343003202695524, + "grad_norm": 1.383664608001709, + "learning_rate": 5e-05, + "loss": 1.9655, + "step": 516000 + }, + { + "epoch": 0.5353357860065089, + "grad_norm": 1.8790479898452759, + "learning_rate": 5e-05, + "loss": 1.9548, + "step": 517000 + }, + { + "epoch": 0.5363712517434654, + "grad_norm": 2.0052480697631836, + "learning_rate": 5e-05, + "loss": 1.9499, + "step": 518000 + }, + { + "epoch": 0.5374067174804219, + "grad_norm": 1.55815589427948, + "learning_rate": 5e-05, + "loss": 1.9489, + "step": 519000 + }, + { + "epoch": 0.5384421832173785, + "grad_norm": 1.6805282831192017, + "learning_rate": 5e-05, + "loss": 1.9517, + "step": 520000 + }, + { + "epoch": 0.5384421832173785, + "eval_loss": 1.9381487369537354, + "eval_runtime": 21.4027, + "eval_samples_per_second": 2310.779, + "eval_steps_per_second": 9.064, + "step": 520000 + }, + { + "epoch": 0.5394776489543349, + "grad_norm": 1.5910086631774902, + "learning_rate": 5e-05, + "loss": 1.9535, + "step": 521000 + }, + { + "epoch": 0.5405131146912914, + "grad_norm": 1.8047144412994385, + "learning_rate": 5e-05, + "loss": 1.9648, + "step": 522000 + }, + { + "epoch": 0.5415485804282479, + "grad_norm": 2.062201976776123, + "learning_rate": 5e-05, + "loss": 1.9476, + "step": 523000 + }, + { + "epoch": 0.5425840461652044, + "grad_norm": 1.7508527040481567, + "learning_rate": 5e-05, + "loss": 1.9569, + "step": 524000 + }, + { + "epoch": 0.543619511902161, + "grad_norm": 1.8220138549804688, + "learning_rate": 5e-05, + "loss": 1.9438, + "step": 525000 + }, + { + "epoch": 0.543619511902161, + "eval_loss": 1.9415658712387085, + "eval_runtime": 20.3248, + "eval_samples_per_second": 2433.336, + "eval_steps_per_second": 9.545, + "step": 525000 + }, + { + "epoch": 0.5446549776391174, + "grad_norm": 1.4566571712493896, + "learning_rate": 5e-05, + "loss": 1.9451, + "step": 526000 + }, + { + "epoch": 0.5456904433760739, + "grad_norm": 1.66045081615448, + "learning_rate": 5e-05, + "loss": 1.947, + "step": 527000 + }, + { + "epoch": 0.5467259091130304, + "grad_norm": 1.6011165380477905, + "learning_rate": 5e-05, + "loss": 1.9504, + "step": 528000 + }, + { + "epoch": 0.5477613748499869, + "grad_norm": 1.4785083532333374, + "learning_rate": 5e-05, + "loss": 1.9418, + "step": 529000 + }, + { + "epoch": 0.5487968405869434, + "grad_norm": 1.584233045578003, + "learning_rate": 5e-05, + "loss": 1.9519, + "step": 530000 + }, + { + "epoch": 0.5487968405869434, + "eval_loss": 1.9392527341842651, + "eval_runtime": 20.2285, + "eval_samples_per_second": 2444.917, + "eval_steps_per_second": 9.59, + "step": 530000 + }, + { + "epoch": 0.5498323063238999, + "grad_norm": 1.7094584703445435, + "learning_rate": 5e-05, + "loss": 1.9535, + "step": 531000 + }, + { + "epoch": 0.5508677720608564, + "grad_norm": 1.5422221422195435, + "learning_rate": 5e-05, + "loss": 1.9615, + "step": 532000 + }, + { + "epoch": 0.5519032377978129, + "grad_norm": 2.052283525466919, + "learning_rate": 5e-05, + "loss": 1.9607, + "step": 533000 + }, + { + "epoch": 0.5529387035347694, + "grad_norm": 1.5253174304962158, + "learning_rate": 5e-05, + "loss": 1.948, + "step": 534000 + }, + { + "epoch": 0.5539741692717259, + "grad_norm": 1.4124022722244263, + "learning_rate": 5e-05, + "loss": 1.9532, + "step": 535000 + }, + { + "epoch": 0.5539741692717259, + "eval_loss": 1.9418219327926636, + "eval_runtime": 20.3658, + "eval_samples_per_second": 2428.43, + "eval_steps_per_second": 9.526, + "step": 535000 + }, + { + "epoch": 0.5550096350086824, + "grad_norm": 1.3456830978393555, + "learning_rate": 5e-05, + "loss": 1.9475, + "step": 536000 + }, + { + "epoch": 0.5560451007456388, + "grad_norm": 2.3639848232269287, + "learning_rate": 5e-05, + "loss": 1.9477, + "step": 537000 + }, + { + "epoch": 0.5570805664825954, + "grad_norm": 1.6593014001846313, + "learning_rate": 5e-05, + "loss": 1.9352, + "step": 538000 + }, + { + "epoch": 0.5581160322195519, + "grad_norm": 1.6242729425430298, + "learning_rate": 5e-05, + "loss": 1.9353, + "step": 539000 + }, + { + "epoch": 0.5591514979565083, + "grad_norm": 1.4847745895385742, + "learning_rate": 5e-05, + "loss": 1.9462, + "step": 540000 + }, + { + "epoch": 0.5591514979565083, + "eval_loss": 1.9415589570999146, + "eval_runtime": 20.4338, + "eval_samples_per_second": 2420.357, + "eval_steps_per_second": 9.494, + "step": 540000 + }, + { + "epoch": 0.5601869636934649, + "grad_norm": 1.6553374528884888, + "learning_rate": 5e-05, + "loss": 1.9401, + "step": 541000 + }, + { + "epoch": 0.5612224294304213, + "grad_norm": 1.6404752731323242, + "learning_rate": 5e-05, + "loss": 1.9523, + "step": 542000 + }, + { + "epoch": 0.5622578951673779, + "grad_norm": 1.7159552574157715, + "learning_rate": 5e-05, + "loss": 1.9538, + "step": 543000 + }, + { + "epoch": 0.5632933609043344, + "grad_norm": 1.8530490398406982, + "learning_rate": 5e-05, + "loss": 1.9471, + "step": 544000 + }, + { + "epoch": 0.5643288266412908, + "grad_norm": 1.7620545625686646, + "learning_rate": 5e-05, + "loss": 1.9563, + "step": 545000 + }, + { + "epoch": 0.5643288266412908, + "eval_loss": 1.9346576929092407, + "eval_runtime": 21.3714, + "eval_samples_per_second": 2314.163, + "eval_steps_per_second": 9.078, + "step": 545000 + }, + { + "epoch": 0.5653642923782474, + "grad_norm": 1.5687700510025024, + "learning_rate": 5e-05, + "loss": 1.9464, + "step": 546000 + }, + { + "epoch": 0.5663997581152038, + "grad_norm": 1.4959015846252441, + "learning_rate": 5e-05, + "loss": 1.9476, + "step": 547000 + }, + { + "epoch": 0.5674352238521604, + "grad_norm": 1.7213226556777954, + "learning_rate": 5e-05, + "loss": 1.9385, + "step": 548000 + }, + { + "epoch": 0.5684706895891168, + "grad_norm": 1.9467849731445312, + "learning_rate": 5e-05, + "loss": 1.9537, + "step": 549000 + }, + { + "epoch": 0.5695061553260733, + "grad_norm": 1.5239862203598022, + "learning_rate": 5e-05, + "loss": 1.953, + "step": 550000 + }, + { + "epoch": 0.5695061553260733, + "eval_loss": 1.9318312406539917, + "eval_runtime": 20.9119, + "eval_samples_per_second": 2365.015, + "eval_steps_per_second": 9.277, + "step": 550000 + }, + { + "epoch": 0.5705416210630299, + "grad_norm": 1.425758957862854, + "learning_rate": 5e-05, + "loss": 1.936, + "step": 551000 + }, + { + "epoch": 0.5715770867999863, + "grad_norm": 1.4828753471374512, + "learning_rate": 5e-05, + "loss": 1.9395, + "step": 552000 + }, + { + "epoch": 0.5726125525369429, + "grad_norm": 1.7051657438278198, + "learning_rate": 5e-05, + "loss": 1.9559, + "step": 553000 + }, + { + "epoch": 0.5736480182738993, + "grad_norm": 1.7012929916381836, + "learning_rate": 5e-05, + "loss": 1.9345, + "step": 554000 + }, + { + "epoch": 0.5746834840108558, + "grad_norm": 1.9521725177764893, + "learning_rate": 5e-05, + "loss": 1.9387, + "step": 555000 + }, + { + "epoch": 0.5746834840108558, + "eval_loss": 1.9329185485839844, + "eval_runtime": 22.9876, + "eval_samples_per_second": 2151.465, + "eval_steps_per_second": 8.439, + "step": 555000 + }, + { + "epoch": 0.5757189497478123, + "grad_norm": 1.610854148864746, + "learning_rate": 5e-05, + "loss": 1.9437, + "step": 556000 + }, + { + "epoch": 0.5767544154847688, + "grad_norm": 1.6509226560592651, + "learning_rate": 5e-05, + "loss": 1.9313, + "step": 557000 + }, + { + "epoch": 0.5777898812217254, + "grad_norm": 1.6432125568389893, + "learning_rate": 5e-05, + "loss": 1.9341, + "step": 558000 + }, + { + "epoch": 0.5788253469586818, + "grad_norm": 1.82942533493042, + "learning_rate": 5e-05, + "loss": 1.9427, + "step": 559000 + }, + { + "epoch": 0.5798608126956383, + "grad_norm": 1.574141263961792, + "learning_rate": 5e-05, + "loss": 1.9342, + "step": 560000 + }, + { + "epoch": 0.5798608126956383, + "eval_loss": 1.9234023094177246, + "eval_runtime": 20.6847, + "eval_samples_per_second": 2390.993, + "eval_steps_per_second": 9.379, + "step": 560000 + }, + { + "epoch": 0.5808962784325948, + "grad_norm": 1.1034334897994995, + "learning_rate": 5e-05, + "loss": 1.9385, + "step": 561000 + }, + { + "epoch": 0.5819317441695513, + "grad_norm": 1.800497055053711, + "learning_rate": 5e-05, + "loss": 1.945, + "step": 562000 + }, + { + "epoch": 0.5829672099065077, + "grad_norm": 1.6229628324508667, + "learning_rate": 5e-05, + "loss": 1.9357, + "step": 563000 + }, + { + "epoch": 0.5840026756434643, + "grad_norm": 1.500807762145996, + "learning_rate": 5e-05, + "loss": 1.9461, + "step": 564000 + }, + { + "epoch": 0.5850381413804208, + "grad_norm": 1.3186248540878296, + "learning_rate": 5e-05, + "loss": 1.9359, + "step": 565000 + }, + { + "epoch": 0.5850381413804208, + "eval_loss": 1.9301375150680542, + "eval_runtime": 21.6317, + "eval_samples_per_second": 2286.315, + "eval_steps_per_second": 8.968, + "step": 565000 + }, + { + "epoch": 0.5860736071173773, + "grad_norm": 1.6370793581008911, + "learning_rate": 5e-05, + "loss": 1.9336, + "step": 566000 + }, + { + "epoch": 0.5871090728543338, + "grad_norm": 1.455642819404602, + "learning_rate": 5e-05, + "loss": 1.935, + "step": 567000 + }, + { + "epoch": 0.5881445385912902, + "grad_norm": 1.7208240032196045, + "learning_rate": 5e-05, + "loss": 1.9391, + "step": 568000 + }, + { + "epoch": 0.5891800043282468, + "grad_norm": 1.6520531177520752, + "learning_rate": 5e-05, + "loss": 1.9313, + "step": 569000 + }, + { + "epoch": 0.5902154700652033, + "grad_norm": 1.641925573348999, + "learning_rate": 5e-05, + "loss": 1.941, + "step": 570000 + }, + { + "epoch": 0.5902154700652033, + "eval_loss": 1.924635887145996, + "eval_runtime": 20.8475, + "eval_samples_per_second": 2372.317, + "eval_steps_per_second": 9.306, + "step": 570000 + }, + { + "epoch": 0.5912509358021598, + "grad_norm": 1.756401538848877, + "learning_rate": 5e-05, + "loss": 1.9198, + "step": 571000 + }, + { + "epoch": 0.5922864015391163, + "grad_norm": 1.855057716369629, + "learning_rate": 5e-05, + "loss": 1.9265, + "step": 572000 + }, + { + "epoch": 0.5933218672760727, + "grad_norm": 1.4401320219039917, + "learning_rate": 5e-05, + "loss": 1.9253, + "step": 573000 + }, + { + "epoch": 0.5943573330130293, + "grad_norm": 1.8701120615005493, + "learning_rate": 5e-05, + "loss": 1.9371, + "step": 574000 + }, + { + "epoch": 0.5953927987499857, + "grad_norm": 1.3622015714645386, + "learning_rate": 5e-05, + "loss": 1.9255, + "step": 575000 + }, + { + "epoch": 0.5953927987499857, + "eval_loss": 1.9278358221054077, + "eval_runtime": 20.619, + "eval_samples_per_second": 2398.61, + "eval_steps_per_second": 9.409, + "step": 575000 + }, + { + "epoch": 0.5964282644869423, + "grad_norm": 1.423957109451294, + "learning_rate": 5e-05, + "loss": 1.9304, + "step": 576000 + }, + { + "epoch": 0.5974637302238988, + "grad_norm": 1.4679838418960571, + "learning_rate": 5e-05, + "loss": 1.9356, + "step": 577000 + }, + { + "epoch": 0.5984991959608552, + "grad_norm": 1.7475810050964355, + "learning_rate": 5e-05, + "loss": 1.9383, + "step": 578000 + }, + { + "epoch": 0.5995346616978118, + "grad_norm": 1.4328486919403076, + "learning_rate": 5e-05, + "loss": 1.9211, + "step": 579000 + }, + { + "epoch": 0.6005701274347682, + "grad_norm": 1.367355465888977, + "learning_rate": 5e-05, + "loss": 1.928, + "step": 580000 + }, + { + "epoch": 0.6005701274347682, + "eval_loss": 1.926809310913086, + "eval_runtime": 21.2573, + "eval_samples_per_second": 2326.589, + "eval_steps_per_second": 9.126, + "step": 580000 + }, + { + "epoch": 0.6016055931717248, + "grad_norm": 1.7061880826950073, + "learning_rate": 5e-05, + "loss": 1.927, + "step": 581000 + }, + { + "epoch": 0.6026410589086812, + "grad_norm": 1.2841322422027588, + "learning_rate": 5e-05, + "loss": 1.9244, + "step": 582000 + }, + { + "epoch": 0.6036765246456377, + "grad_norm": 1.597072958946228, + "learning_rate": 5e-05, + "loss": 1.9297, + "step": 583000 + }, + { + "epoch": 0.6047119903825943, + "grad_norm": 1.5782889127731323, + "learning_rate": 5e-05, + "loss": 1.9293, + "step": 584000 + }, + { + "epoch": 0.6057474561195507, + "grad_norm": 1.709469199180603, + "learning_rate": 5e-05, + "loss": 1.9275, + "step": 585000 + }, + { + "epoch": 0.6057474561195507, + "eval_loss": 1.9254355430603027, + "eval_runtime": 20.5016, + "eval_samples_per_second": 2412.346, + "eval_steps_per_second": 9.463, + "step": 585000 + }, + { + "epoch": 0.6067829218565072, + "grad_norm": 1.491125226020813, + "learning_rate": 5e-05, + "loss": 1.9303, + "step": 586000 + }, + { + "epoch": 0.6078183875934637, + "grad_norm": 1.8089282512664795, + "learning_rate": 5e-05, + "loss": 1.9119, + "step": 587000 + }, + { + "epoch": 0.6088538533304202, + "grad_norm": 1.6499803066253662, + "learning_rate": 5e-05, + "loss": 1.9297, + "step": 588000 + }, + { + "epoch": 0.6098893190673768, + "grad_norm": 1.563252568244934, + "learning_rate": 5e-05, + "loss": 1.9225, + "step": 589000 + }, + { + "epoch": 0.6109247848043332, + "grad_norm": 1.752363681793213, + "learning_rate": 5e-05, + "loss": 1.9248, + "step": 590000 + }, + { + "epoch": 0.6109247848043332, + "eval_loss": 1.9221693277359009, + "eval_runtime": 21.0482, + "eval_samples_per_second": 2349.701, + "eval_steps_per_second": 9.217, + "step": 590000 + }, + { + "epoch": 0.6119602505412897, + "grad_norm": 1.770412802696228, + "learning_rate": 5e-05, + "loss": 1.9174, + "step": 591000 + }, + { + "epoch": 0.6129957162782462, + "grad_norm": 1.7694320678710938, + "learning_rate": 5e-05, + "loss": 1.932, + "step": 592000 + }, + { + "epoch": 0.6140311820152027, + "grad_norm": 1.4580289125442505, + "learning_rate": 5e-05, + "loss": 1.935, + "step": 593000 + }, + { + "epoch": 0.6150666477521592, + "grad_norm": 1.39983332157135, + "learning_rate": 5e-05, + "loss": 1.9131, + "step": 594000 + }, + { + "epoch": 0.6161021134891157, + "grad_norm": 1.8075814247131348, + "learning_rate": 5e-05, + "loss": 1.9265, + "step": 595000 + }, + { + "epoch": 0.6161021134891157, + "eval_loss": 1.9219050407409668, + "eval_runtime": 20.8447, + "eval_samples_per_second": 2372.645, + "eval_steps_per_second": 9.307, + "step": 595000 + }, + { + "epoch": 0.6171375792260722, + "grad_norm": 1.6267366409301758, + "learning_rate": 5e-05, + "loss": 1.9316, + "step": 596000 + }, + { + "epoch": 0.6181730449630287, + "grad_norm": 1.6033700704574585, + "learning_rate": 5e-05, + "loss": 1.92, + "step": 597000 + }, + { + "epoch": 0.6192085106999852, + "grad_norm": 1.993261456489563, + "learning_rate": 5e-05, + "loss": 1.9246, + "step": 598000 + }, + { + "epoch": 0.6202439764369417, + "grad_norm": 1.785793423652649, + "learning_rate": 5e-05, + "loss": 1.9264, + "step": 599000 + }, + { + "epoch": 0.6212794421738982, + "grad_norm": 1.7126010656356812, + "learning_rate": 5e-05, + "loss": 1.9228, + "step": 600000 + }, + { + "epoch": 0.6212794421738982, + "eval_loss": 1.910009741783142, + "eval_runtime": 21.3491, + "eval_samples_per_second": 2316.58, + "eval_steps_per_second": 9.087, + "step": 600000 + }, + { + "epoch": 0.6223149079108546, + "grad_norm": 1.6621829271316528, + "learning_rate": 5e-05, + "loss": 1.9381, + "step": 601000 + }, + { + "epoch": 0.6233503736478112, + "grad_norm": 2.010495185852051, + "learning_rate": 5e-05, + "loss": 1.9177, + "step": 602000 + }, + { + "epoch": 0.6243858393847677, + "grad_norm": 1.4604156017303467, + "learning_rate": 5e-05, + "loss": 1.9163, + "step": 603000 + }, + { + "epoch": 0.6254213051217242, + "grad_norm": 1.5160646438598633, + "learning_rate": 5e-05, + "loss": 1.9309, + "step": 604000 + }, + { + "epoch": 0.6264567708586807, + "grad_norm": 1.889029860496521, + "learning_rate": 5e-05, + "loss": 1.9245, + "step": 605000 + }, + { + "epoch": 0.6264567708586807, + "eval_loss": 1.9162534475326538, + "eval_runtime": 20.621, + "eval_samples_per_second": 2398.382, + "eval_steps_per_second": 9.408, + "step": 605000 + }, + { + "epoch": 0.6274922365956371, + "grad_norm": 1.5529805421829224, + "learning_rate": 5e-05, + "loss": 1.9274, + "step": 606000 + }, + { + "epoch": 0.6285277023325937, + "grad_norm": 1.7846838235855103, + "learning_rate": 5e-05, + "loss": 1.9304, + "step": 607000 + }, + { + "epoch": 0.6295631680695501, + "grad_norm": 1.9997135400772095, + "learning_rate": 5e-05, + "loss": 1.9235, + "step": 608000 + }, + { + "epoch": 0.6305986338065067, + "grad_norm": 1.391270637512207, + "learning_rate": 5e-05, + "loss": 1.9193, + "step": 609000 + }, + { + "epoch": 0.6316340995434632, + "grad_norm": 1.8219945430755615, + "learning_rate": 5e-05, + "loss": 1.9121, + "step": 610000 + }, + { + "epoch": 0.6316340995434632, + "eval_loss": 1.9175958633422852, + "eval_runtime": 21.497, + "eval_samples_per_second": 2300.643, + "eval_steps_per_second": 9.025, + "step": 610000 + }, + { + "epoch": 0.6326695652804196, + "grad_norm": 1.5030641555786133, + "learning_rate": 5e-05, + "loss": 1.9238, + "step": 611000 + }, + { + "epoch": 0.6337050310173762, + "grad_norm": 1.6022168397903442, + "learning_rate": 5e-05, + "loss": 1.9356, + "step": 612000 + }, + { + "epoch": 0.6347404967543326, + "grad_norm": 1.4753010272979736, + "learning_rate": 5e-05, + "loss": 1.9258, + "step": 613000 + }, + { + "epoch": 0.6357759624912891, + "grad_norm": 1.8194416761398315, + "learning_rate": 5e-05, + "loss": 1.9158, + "step": 614000 + }, + { + "epoch": 0.6368114282282457, + "grad_norm": 1.3283801078796387, + "learning_rate": 5e-05, + "loss": 1.909, + "step": 615000 + }, + { + "epoch": 0.6368114282282457, + "eval_loss": 1.9115009307861328, + "eval_runtime": 20.6331, + "eval_samples_per_second": 2396.974, + "eval_steps_per_second": 9.402, + "step": 615000 + }, + { + "epoch": 0.6378468939652021, + "grad_norm": 1.2602663040161133, + "learning_rate": 5e-05, + "loss": 1.9231, + "step": 616000 + }, + { + "epoch": 0.6388823597021587, + "grad_norm": 1.8703596591949463, + "learning_rate": 5e-05, + "loss": 1.9159, + "step": 617000 + }, + { + "epoch": 0.6399178254391151, + "grad_norm": 1.7976280450820923, + "learning_rate": 5e-05, + "loss": 1.9194, + "step": 618000 + }, + { + "epoch": 0.6409532911760716, + "grad_norm": 1.4662623405456543, + "learning_rate": 5e-05, + "loss": 1.912, + "step": 619000 + }, + { + "epoch": 0.6419887569130281, + "grad_norm": 1.8969612121582031, + "learning_rate": 5e-05, + "loss": 1.9232, + "step": 620000 + }, + { + "epoch": 0.6419887569130281, + "eval_loss": 1.9057178497314453, + "eval_runtime": 21.3901, + "eval_samples_per_second": 2312.146, + "eval_steps_per_second": 9.07, + "step": 620000 + }, + { + "epoch": 0.6430242226499846, + "grad_norm": 1.6290833950042725, + "learning_rate": 5e-05, + "loss": 1.9103, + "step": 621000 + }, + { + "epoch": 0.6440596883869412, + "grad_norm": 1.7869369983673096, + "learning_rate": 5e-05, + "loss": 1.9181, + "step": 622000 + }, + { + "epoch": 0.6450951541238976, + "grad_norm": 1.555368185043335, + "learning_rate": 5e-05, + "loss": 1.9203, + "step": 623000 + }, + { + "epoch": 0.6461306198608541, + "grad_norm": 1.8794424533843994, + "learning_rate": 5e-05, + "loss": 1.918, + "step": 624000 + }, + { + "epoch": 0.6471660855978106, + "grad_norm": 1.838217854499817, + "learning_rate": 5e-05, + "loss": 1.9268, + "step": 625000 + }, + { + "epoch": 0.6471660855978106, + "eval_loss": 1.9147250652313232, + "eval_runtime": 20.7607, + "eval_samples_per_second": 2382.241, + "eval_steps_per_second": 9.345, + "step": 625000 + }, + { + "epoch": 0.6482015513347671, + "grad_norm": 1.9131754636764526, + "learning_rate": 5e-05, + "loss": 1.913, + "step": 626000 + }, + { + "epoch": 0.6492370170717237, + "grad_norm": 1.3449556827545166, + "learning_rate": 5e-05, + "loss": 1.9134, + "step": 627000 + }, + { + "epoch": 0.6502724828086801, + "grad_norm": 1.7116332054138184, + "learning_rate": 5e-05, + "loss": 1.92, + "step": 628000 + }, + { + "epoch": 0.6513079485456366, + "grad_norm": 1.9661062955856323, + "learning_rate": 5e-05, + "loss": 1.9181, + "step": 629000 + }, + { + "epoch": 0.6523434142825931, + "grad_norm": 2.0663058757781982, + "learning_rate": 5e-05, + "loss": 1.9209, + "step": 630000 + }, + { + "epoch": 0.6523434142825931, + "eval_loss": 1.9034740924835205, + "eval_runtime": 21.31, + "eval_samples_per_second": 2320.84, + "eval_steps_per_second": 9.104, + "step": 630000 + }, + { + "epoch": 0.6533788800195496, + "grad_norm": 1.7314753532409668, + "learning_rate": 5e-05, + "loss": 1.9258, + "step": 631000 + }, + { + "epoch": 0.6544143457565061, + "grad_norm": 1.4399973154067993, + "learning_rate": 5e-05, + "loss": 1.9185, + "step": 632000 + }, + { + "epoch": 0.6554498114934626, + "grad_norm": 1.7365604639053345, + "learning_rate": 5e-05, + "loss": 1.9213, + "step": 633000 + }, + { + "epoch": 0.656485277230419, + "grad_norm": 1.6954264640808105, + "learning_rate": 5e-05, + "loss": 1.9203, + "step": 634000 + }, + { + "epoch": 0.6575207429673756, + "grad_norm": 1.5337830781936646, + "learning_rate": 5e-05, + "loss": 1.9206, + "step": 635000 + }, + { + "epoch": 0.6575207429673756, + "eval_loss": 1.8993173837661743, + "eval_runtime": 20.4063, + "eval_samples_per_second": 2423.615, + "eval_steps_per_second": 9.507, + "step": 635000 + }, + { + "epoch": 0.6585562087043321, + "grad_norm": 1.5964773893356323, + "learning_rate": 5e-05, + "loss": 1.9141, + "step": 636000 + }, + { + "epoch": 0.6595916744412885, + "grad_norm": 1.443393588066101, + "learning_rate": 5e-05, + "loss": 1.9231, + "step": 637000 + }, + { + "epoch": 0.6606271401782451, + "grad_norm": 1.5183662176132202, + "learning_rate": 5e-05, + "loss": 1.9169, + "step": 638000 + }, + { + "epoch": 0.6616626059152015, + "grad_norm": 1.3888758420944214, + "learning_rate": 5e-05, + "loss": 1.9232, + "step": 639000 + }, + { + "epoch": 0.6626980716521581, + "grad_norm": 2.3498077392578125, + "learning_rate": 5e-05, + "loss": 1.9112, + "step": 640000 + }, + { + "epoch": 0.6626980716521581, + "eval_loss": 1.9156203269958496, + "eval_runtime": 21.2557, + "eval_samples_per_second": 2326.767, + "eval_steps_per_second": 9.127, + "step": 640000 + }, + { + "epoch": 0.6637335373891146, + "grad_norm": 1.3806946277618408, + "learning_rate": 5e-05, + "loss": 1.9239, + "step": 641000 + }, + { + "epoch": 0.664769003126071, + "grad_norm": 1.379116177558899, + "learning_rate": 5e-05, + "loss": 1.9193, + "step": 642000 + }, + { + "epoch": 0.6658044688630276, + "grad_norm": 1.5152908563613892, + "learning_rate": 5e-05, + "loss": 1.9157, + "step": 643000 + }, + { + "epoch": 0.666839934599984, + "grad_norm": 1.9281625747680664, + "learning_rate": 5e-05, + "loss": 1.9223, + "step": 644000 + }, + { + "epoch": 0.6678754003369406, + "grad_norm": 1.6093673706054688, + "learning_rate": 5e-05, + "loss": 1.9197, + "step": 645000 + }, + { + "epoch": 0.6678754003369406, + "eval_loss": 1.9053946733474731, + "eval_runtime": 20.5485, + "eval_samples_per_second": 2406.846, + "eval_steps_per_second": 9.441, + "step": 645000 + }, + { + "epoch": 0.668910866073897, + "grad_norm": 1.7991442680358887, + "learning_rate": 5e-05, + "loss": 1.9208, + "step": 646000 + }, + { + "epoch": 0.6699463318108535, + "grad_norm": 1.7188022136688232, + "learning_rate": 5e-05, + "loss": 1.9274, + "step": 647000 + }, + { + "epoch": 0.6709817975478101, + "grad_norm": 1.4109748601913452, + "learning_rate": 5e-05, + "loss": 1.9107, + "step": 648000 + }, + { + "epoch": 0.6720172632847665, + "grad_norm": 1.8776260614395142, + "learning_rate": 5e-05, + "loss": 1.9152, + "step": 649000 + }, + { + "epoch": 0.6730527290217231, + "grad_norm": 1.761953592300415, + "learning_rate": 5e-05, + "loss": 1.9088, + "step": 650000 + }, + { + "epoch": 0.6730527290217231, + "eval_loss": 1.902655005455017, + "eval_runtime": 21.2187, + "eval_samples_per_second": 2330.819, + "eval_steps_per_second": 9.143, + "step": 650000 + }, + { + "epoch": 0.6740881947586795, + "grad_norm": 1.4429457187652588, + "learning_rate": 5e-05, + "loss": 1.913, + "step": 651000 + }, + { + "epoch": 0.675123660495636, + "grad_norm": 1.3650360107421875, + "learning_rate": 5e-05, + "loss": 1.9099, + "step": 652000 + }, + { + "epoch": 0.6761591262325926, + "grad_norm": 1.3658324480056763, + "learning_rate": 5e-05, + "loss": 1.9249, + "step": 653000 + }, + { + "epoch": 0.677194591969549, + "grad_norm": 2.063399076461792, + "learning_rate": 5e-05, + "loss": 1.9176, + "step": 654000 + }, + { + "epoch": 0.6782300577065056, + "grad_norm": 1.9073212146759033, + "learning_rate": 5e-05, + "loss": 1.9055, + "step": 655000 + }, + { + "epoch": 0.6782300577065056, + "eval_loss": 1.9057023525238037, + "eval_runtime": 20.7336, + "eval_samples_per_second": 2385.35, + "eval_steps_per_second": 9.357, + "step": 655000 + }, + { + "epoch": 0.679265523443462, + "grad_norm": 1.8441895246505737, + "learning_rate": 5e-05, + "loss": 1.9204, + "step": 656000 + }, + { + "epoch": 0.6803009891804185, + "grad_norm": 1.539784550666809, + "learning_rate": 5e-05, + "loss": 1.9236, + "step": 657000 + }, + { + "epoch": 0.681336454917375, + "grad_norm": 1.7579704523086548, + "learning_rate": 5e-05, + "loss": 1.8967, + "step": 658000 + }, + { + "epoch": 0.6823719206543315, + "grad_norm": 1.5053812265396118, + "learning_rate": 5e-05, + "loss": 1.9077, + "step": 659000 + }, + { + "epoch": 0.683407386391288, + "grad_norm": 1.3976589441299438, + "learning_rate": 5e-05, + "loss": 1.9107, + "step": 660000 + }, + { + "epoch": 0.683407386391288, + "eval_loss": 1.9005506038665771, + "eval_runtime": 21.2622, + "eval_samples_per_second": 2326.058, + "eval_steps_per_second": 9.124, + "step": 660000 + }, + { + "epoch": 0.6844428521282445, + "grad_norm": 1.442474603652954, + "learning_rate": 5e-05, + "loss": 1.8983, + "step": 661000 + }, + { + "epoch": 0.685478317865201, + "grad_norm": 1.574547290802002, + "learning_rate": 5e-05, + "loss": 1.9212, + "step": 662000 + }, + { + "epoch": 0.6865137836021575, + "grad_norm": 1.293898105621338, + "learning_rate": 5e-05, + "loss": 1.9139, + "step": 663000 + }, + { + "epoch": 0.687549249339114, + "grad_norm": 1.7832250595092773, + "learning_rate": 5e-05, + "loss": 1.9123, + "step": 664000 + }, + { + "epoch": 0.6885847150760704, + "grad_norm": 1.4399775266647339, + "learning_rate": 5e-05, + "loss": 1.9073, + "step": 665000 + }, + { + "epoch": 0.6885847150760704, + "eval_loss": 1.895343542098999, + "eval_runtime": 20.1859, + "eval_samples_per_second": 2450.071, + "eval_steps_per_second": 9.611, + "step": 665000 + }, + { + "epoch": 0.689620180813027, + "grad_norm": 1.6234581470489502, + "learning_rate": 5e-05, + "loss": 1.9009, + "step": 666000 + }, + { + "epoch": 0.6906556465499835, + "grad_norm": 1.2184017896652222, + "learning_rate": 5e-05, + "loss": 1.9023, + "step": 667000 + }, + { + "epoch": 0.69169111228694, + "grad_norm": 1.973521113395691, + "learning_rate": 5e-05, + "loss": 1.8883, + "step": 668000 + }, + { + "epoch": 0.6927265780238965, + "grad_norm": 1.455173373222351, + "learning_rate": 5e-05, + "loss": 1.9057, + "step": 669000 + }, + { + "epoch": 0.6937620437608529, + "grad_norm": 1.7372217178344727, + "learning_rate": 5e-05, + "loss": 1.9045, + "step": 670000 + }, + { + "epoch": 0.6937620437608529, + "eval_loss": 1.8927185535430908, + "eval_runtime": 21.4642, + "eval_samples_per_second": 2304.16, + "eval_steps_per_second": 9.038, + "step": 670000 + }, + { + "epoch": 0.6947975094978095, + "grad_norm": 1.8958580493927002, + "learning_rate": 5e-05, + "loss": 1.897, + "step": 671000 + }, + { + "epoch": 0.695832975234766, + "grad_norm": 1.4989745616912842, + "learning_rate": 5e-05, + "loss": 1.9102, + "step": 672000 + }, + { + "epoch": 0.6968684409717225, + "grad_norm": 1.7313189506530762, + "learning_rate": 5e-05, + "loss": 1.9097, + "step": 673000 + }, + { + "epoch": 0.697903906708679, + "grad_norm": 1.197485327720642, + "learning_rate": 5e-05, + "loss": 1.8999, + "step": 674000 + }, + { + "epoch": 0.6989393724456354, + "grad_norm": 1.484868049621582, + "learning_rate": 5e-05, + "loss": 1.9096, + "step": 675000 + }, + { + "epoch": 0.6989393724456354, + "eval_loss": 1.897947907447815, + "eval_runtime": 21.0415, + "eval_samples_per_second": 2350.453, + "eval_steps_per_second": 9.22, + "step": 675000 + }, + { + "epoch": 0.699974838182592, + "grad_norm": 1.7760064601898193, + "learning_rate": 5e-05, + "loss": 1.9048, + "step": 676000 + }, + { + "epoch": 0.7010103039195484, + "grad_norm": 2.166504383087158, + "learning_rate": 5e-05, + "loss": 1.9077, + "step": 677000 + }, + { + "epoch": 0.702045769656505, + "grad_norm": 1.9204415082931519, + "learning_rate": 5e-05, + "loss": 1.9092, + "step": 678000 + }, + { + "epoch": 0.7030812353934615, + "grad_norm": 1.7995051145553589, + "learning_rate": 5e-05, + "loss": 1.912, + "step": 679000 + }, + { + "epoch": 0.7041167011304179, + "grad_norm": 1.947636604309082, + "learning_rate": 5e-05, + "loss": 1.9136, + "step": 680000 + }, + { + "epoch": 0.7041167011304179, + "eval_loss": 1.896292805671692, + "eval_runtime": 20.6404, + "eval_samples_per_second": 2396.129, + "eval_steps_per_second": 9.399, + "step": 680000 + }, + { + "epoch": 0.7051521668673745, + "grad_norm": 1.4779621362686157, + "learning_rate": 5e-05, + "loss": 1.9079, + "step": 681000 + }, + { + "epoch": 0.7061876326043309, + "grad_norm": 1.6015859842300415, + "learning_rate": 5e-05, + "loss": 1.9054, + "step": 682000 + }, + { + "epoch": 0.7072230983412874, + "grad_norm": 1.571121335029602, + "learning_rate": 5e-05, + "loss": 1.9174, + "step": 683000 + }, + { + "epoch": 0.708258564078244, + "grad_norm": 1.3641341924667358, + "learning_rate": 5e-05, + "loss": 1.9003, + "step": 684000 + }, + { + "epoch": 0.7092940298152004, + "grad_norm": 1.9284065961837769, + "learning_rate": 5e-05, + "loss": 1.9146, + "step": 685000 + }, + { + "epoch": 0.7092940298152004, + "eval_loss": 1.892472743988037, + "eval_runtime": 23.451, + "eval_samples_per_second": 2108.955, + "eval_steps_per_second": 8.273, + "step": 685000 + }, + { + "epoch": 0.710329495552157, + "grad_norm": 1.8129565715789795, + "learning_rate": 5e-05, + "loss": 1.912, + "step": 686000 + }, + { + "epoch": 0.7113649612891134, + "grad_norm": 1.6975070238113403, + "learning_rate": 5e-05, + "loss": 1.8925, + "step": 687000 + }, + { + "epoch": 0.7124004270260699, + "grad_norm": 1.4034419059753418, + "learning_rate": 5e-05, + "loss": 1.9091, + "step": 688000 + }, + { + "epoch": 0.7134358927630264, + "grad_norm": 2.147268056869507, + "learning_rate": 5e-05, + "loss": 1.9041, + "step": 689000 + }, + { + "epoch": 0.7144713584999829, + "grad_norm": 2.014146327972412, + "learning_rate": 5e-05, + "loss": 1.8952, + "step": 690000 + }, + { + "epoch": 0.7144713584999829, + "eval_loss": 1.9015657901763916, + "eval_runtime": 18.7155, + "eval_samples_per_second": 2642.575, + "eval_steps_per_second": 10.366, + "step": 690000 + }, + { + "epoch": 0.7155068242369395, + "grad_norm": 1.773491621017456, + "learning_rate": 5e-05, + "loss": 1.9114, + "step": 691000 + }, + { + "epoch": 0.7165422899738959, + "grad_norm": 1.6221060752868652, + "learning_rate": 5e-05, + "loss": 1.9072, + "step": 692000 + }, + { + "epoch": 0.7175777557108524, + "grad_norm": 2.0108203887939453, + "learning_rate": 5e-05, + "loss": 1.8966, + "step": 693000 + }, + { + "epoch": 0.7186132214478089, + "grad_norm": 1.7623751163482666, + "learning_rate": 5e-05, + "loss": 1.8931, + "step": 694000 + }, + { + "epoch": 0.7196486871847654, + "grad_norm": 1.6833388805389404, + "learning_rate": 5e-05, + "loss": 1.9088, + "step": 695000 + }, + { + "epoch": 0.7196486871847654, + "eval_loss": 1.8960736989974976, + "eval_runtime": 20.0395, + "eval_samples_per_second": 2467.97, + "eval_steps_per_second": 9.681, + "step": 695000 + }, + { + "epoch": 0.7206841529217219, + "grad_norm": 1.7570165395736694, + "learning_rate": 5e-05, + "loss": 1.8961, + "step": 696000 + }, + { + "epoch": 0.7217196186586784, + "grad_norm": 1.8349229097366333, + "learning_rate": 5e-05, + "loss": 1.9142, + "step": 697000 + }, + { + "epoch": 0.7227550843956349, + "grad_norm": 1.9120980501174927, + "learning_rate": 5e-05, + "loss": 1.9123, + "step": 698000 + }, + { + "epoch": 0.7237905501325914, + "grad_norm": 1.4346221685409546, + "learning_rate": 5e-05, + "loss": 1.9, + "step": 699000 + }, + { + "epoch": 0.7248260158695479, + "grad_norm": 1.5059664249420166, + "learning_rate": 5e-05, + "loss": 1.9088, + "step": 700000 + }, + { + "epoch": 0.7248260158695479, + "eval_loss": 1.8930763006210327, + "eval_runtime": 20.3037, + "eval_samples_per_second": 2435.865, + "eval_steps_per_second": 9.555, + "step": 700000 + }, + { + "epoch": 0.7258614816065044, + "grad_norm": 1.4954737424850464, + "learning_rate": 5e-05, + "loss": 1.8894, + "step": 701000 + }, + { + "epoch": 0.7268969473434609, + "grad_norm": 2.206475257873535, + "learning_rate": 5e-05, + "loss": 1.897, + "step": 702000 + }, + { + "epoch": 0.7279324130804173, + "grad_norm": 1.39924156665802, + "learning_rate": 5e-05, + "loss": 1.9081, + "step": 703000 + }, + { + "epoch": 0.7289678788173739, + "grad_norm": 1.6110947132110596, + "learning_rate": 5e-05, + "loss": 1.8973, + "step": 704000 + }, + { + "epoch": 0.7300033445543304, + "grad_norm": 1.7667462825775146, + "learning_rate": 5e-05, + "loss": 1.893, + "step": 705000 + }, + { + "epoch": 0.7300033445543304, + "eval_loss": 1.8968194723129272, + "eval_runtime": 21.1571, + "eval_samples_per_second": 2337.611, + "eval_steps_per_second": 9.17, + "step": 705000 + }, + { + "epoch": 0.7310388102912868, + "grad_norm": 1.4846593141555786, + "learning_rate": 5e-05, + "loss": 1.8858, + "step": 706000 + }, + { + "epoch": 0.7320742760282434, + "grad_norm": 1.5730334520339966, + "learning_rate": 5e-05, + "loss": 1.902, + "step": 707000 + }, + { + "epoch": 0.7331097417651998, + "grad_norm": 2.5404248237609863, + "learning_rate": 5e-05, + "loss": 1.8896, + "step": 708000 + }, + { + "epoch": 0.7341452075021564, + "grad_norm": 2.4605815410614014, + "learning_rate": 5e-05, + "loss": 1.9059, + "step": 709000 + }, + { + "epoch": 0.7351806732391128, + "grad_norm": 1.3232377767562866, + "learning_rate": 5e-05, + "loss": 1.9015, + "step": 710000 + }, + { + "epoch": 0.7351806732391128, + "eval_loss": 1.8934879302978516, + "eval_runtime": 21.4904, + "eval_samples_per_second": 2301.348, + "eval_steps_per_second": 9.027, + "step": 710000 + }, + { + "epoch": 0.7362161389760693, + "grad_norm": 1.9923709630966187, + "learning_rate": 5e-05, + "loss": 1.8914, + "step": 711000 + }, + { + "epoch": 0.7372516047130259, + "grad_norm": 1.58742094039917, + "learning_rate": 5e-05, + "loss": 1.913, + "step": 712000 + }, + { + "epoch": 0.7382870704499823, + "grad_norm": 1.5676469802856445, + "learning_rate": 5e-05, + "loss": 1.8864, + "step": 713000 + }, + { + "epoch": 0.7393225361869389, + "grad_norm": 1.5205984115600586, + "learning_rate": 5e-05, + "loss": 1.8859, + "step": 714000 + }, + { + "epoch": 0.7403580019238953, + "grad_norm": 1.9802634716033936, + "learning_rate": 5e-05, + "loss": 1.8972, + "step": 715000 + }, + { + "epoch": 0.7403580019238953, + "eval_loss": 1.8818447589874268, + "eval_runtime": 32.1303, + "eval_samples_per_second": 1539.266, + "eval_steps_per_second": 6.038, + "step": 715000 + }, + { + "epoch": 0.7413934676608518, + "grad_norm": 1.7090638875961304, + "learning_rate": 5e-05, + "loss": 1.8931, + "step": 716000 + }, + { + "epoch": 0.7424289333978084, + "grad_norm": 1.3374643325805664, + "learning_rate": 5e-05, + "loss": 1.9076, + "step": 717000 + }, + { + "epoch": 0.7434643991347648, + "grad_norm": 1.7601673603057861, + "learning_rate": 5e-05, + "loss": 1.9031, + "step": 718000 + }, + { + "epoch": 0.7444998648717214, + "grad_norm": 1.3686988353729248, + "learning_rate": 5e-05, + "loss": 1.8969, + "step": 719000 + }, + { + "epoch": 0.7455353306086778, + "grad_norm": 1.6470602750778198, + "learning_rate": 5e-05, + "loss": 1.8986, + "step": 720000 + }, + { + "epoch": 0.7455353306086778, + "eval_loss": 1.8832844495773315, + "eval_runtime": 20.1206, + "eval_samples_per_second": 2458.028, + "eval_steps_per_second": 9.642, + "step": 720000 + }, + { + "epoch": 0.7465707963456343, + "grad_norm": 1.6907163858413696, + "learning_rate": 5e-05, + "loss": 1.8998, + "step": 721000 + }, + { + "epoch": 0.7476062620825908, + "grad_norm": 1.5861003398895264, + "learning_rate": 5e-05, + "loss": 1.9007, + "step": 722000 + }, + { + "epoch": 0.7486417278195473, + "grad_norm": 1.5830234289169312, + "learning_rate": 5e-05, + "loss": 1.8931, + "step": 723000 + }, + { + "epoch": 0.7496771935565039, + "grad_norm": 1.778434157371521, + "learning_rate": 5e-05, + "loss": 1.9, + "step": 724000 + }, + { + "epoch": 0.7507126592934603, + "grad_norm": 1.8183609247207642, + "learning_rate": 5e-05, + "loss": 1.8913, + "step": 725000 + }, + { + "epoch": 0.7507126592934603, + "eval_loss": 1.8839964866638184, + "eval_runtime": 24.8386, + "eval_samples_per_second": 1991.138, + "eval_steps_per_second": 7.81, + "step": 725000 + }, + { + "epoch": 0.7517481250304168, + "grad_norm": 1.8275715112686157, + "learning_rate": 5e-05, + "loss": 1.8965, + "step": 726000 + }, + { + "epoch": 0.7527835907673733, + "grad_norm": 1.6312036514282227, + "learning_rate": 5e-05, + "loss": 1.8781, + "step": 727000 + }, + { + "epoch": 0.7538190565043298, + "grad_norm": 1.813920259475708, + "learning_rate": 5e-05, + "loss": 1.8957, + "step": 728000 + }, + { + "epoch": 0.7548545222412864, + "grad_norm": 1.5408565998077393, + "learning_rate": 5e-05, + "loss": 1.8962, + "step": 729000 + }, + { + "epoch": 0.7558899879782428, + "grad_norm": 1.43170964717865, + "learning_rate": 5e-05, + "loss": 1.9008, + "step": 730000 + }, + { + "epoch": 0.7558899879782428, + "eval_loss": 1.8895530700683594, + "eval_runtime": 23.6536, + "eval_samples_per_second": 2090.887, + "eval_steps_per_second": 8.202, + "step": 730000 + }, + { + "epoch": 0.7569254537151993, + "grad_norm": 1.4100663661956787, + "learning_rate": 5e-05, + "loss": 1.8896, + "step": 731000 + }, + { + "epoch": 0.7579609194521558, + "grad_norm": 1.88717782497406, + "learning_rate": 5e-05, + "loss": 1.897, + "step": 732000 + }, + { + "epoch": 0.7589963851891123, + "grad_norm": 1.6897785663604736, + "learning_rate": 5e-05, + "loss": 1.8866, + "step": 733000 + }, + { + "epoch": 0.7600318509260687, + "grad_norm": 1.6449230909347534, + "learning_rate": 5e-05, + "loss": 1.9059, + "step": 734000 + }, + { + "epoch": 0.7610673166630253, + "grad_norm": 1.682363510131836, + "learning_rate": 5e-05, + "loss": 1.9016, + "step": 735000 + }, + { + "epoch": 0.7610673166630253, + "eval_loss": 1.8788481950759888, + "eval_runtime": 23.8164, + "eval_samples_per_second": 2076.595, + "eval_steps_per_second": 8.146, + "step": 735000 + }, + { + "epoch": 0.7621027823999817, + "grad_norm": 1.388289213180542, + "learning_rate": 5e-05, + "loss": 1.9026, + "step": 736000 + }, + { + "epoch": 0.7631382481369383, + "grad_norm": 1.4822994470596313, + "learning_rate": 5e-05, + "loss": 1.8702, + "step": 737000 + }, + { + "epoch": 0.7641737138738948, + "grad_norm": 1.3337068557739258, + "learning_rate": 5e-05, + "loss": 1.8845, + "step": 738000 + }, + { + "epoch": 0.7652091796108512, + "grad_norm": 1.6779500246047974, + "learning_rate": 5e-05, + "loss": 1.8853, + "step": 739000 + }, + { + "epoch": 0.7662446453478078, + "grad_norm": 1.7902178764343262, + "learning_rate": 5e-05, + "loss": 1.8996, + "step": 740000 + }, + { + "epoch": 0.7662446453478078, + "eval_loss": 1.8781044483184814, + "eval_runtime": 24.6681, + "eval_samples_per_second": 2004.9, + "eval_steps_per_second": 7.864, + "step": 740000 + }, + { + "epoch": 0.7672801110847642, + "grad_norm": 1.4279536008834839, + "learning_rate": 5e-05, + "loss": 1.8891, + "step": 741000 + }, + { + "epoch": 0.7683155768217208, + "grad_norm": 1.4116427898406982, + "learning_rate": 5e-05, + "loss": 1.9103, + "step": 742000 + }, + { + "epoch": 0.7693510425586773, + "grad_norm": 1.2686455249786377, + "learning_rate": 5e-05, + "loss": 1.8768, + "step": 743000 + }, + { + "epoch": 0.7703865082956337, + "grad_norm": 1.9228949546813965, + "learning_rate": 5e-05, + "loss": 1.8958, + "step": 744000 + }, + { + "epoch": 0.7714219740325903, + "grad_norm": 1.6242496967315674, + "learning_rate": 5e-05, + "loss": 1.8885, + "step": 745000 + }, + { + "epoch": 0.7714219740325903, + "eval_loss": 1.8771369457244873, + "eval_runtime": 24.8877, + "eval_samples_per_second": 1987.21, + "eval_steps_per_second": 7.795, + "step": 745000 + }, + { + "epoch": 0.7724574397695467, + "grad_norm": 1.3284131288528442, + "learning_rate": 5e-05, + "loss": 1.8882, + "step": 746000 + }, + { + "epoch": 0.7734929055065033, + "grad_norm": 2.2554099559783936, + "learning_rate": 5e-05, + "loss": 1.8915, + "step": 747000 + }, + { + "epoch": 0.7745283712434597, + "grad_norm": 1.8585548400878906, + "learning_rate": 5e-05, + "loss": 1.8901, + "step": 748000 + }, + { + "epoch": 0.7755638369804162, + "grad_norm": 1.4987225532531738, + "learning_rate": 5e-05, + "loss": 1.8888, + "step": 749000 + }, + { + "epoch": 0.7765993027173728, + "grad_norm": 1.642417550086975, + "learning_rate": 5e-05, + "loss": 1.8925, + "step": 750000 + }, + { + "epoch": 0.7765993027173728, + "eval_loss": 1.8830502033233643, + "eval_runtime": 24.3276, + "eval_samples_per_second": 2032.956, + "eval_steps_per_second": 7.974, + "step": 750000 + }, + { + "epoch": 0.7776347684543292, + "grad_norm": 1.4527113437652588, + "learning_rate": 5e-05, + "loss": 1.886, + "step": 751000 + }, + { + "epoch": 0.7786702341912858, + "grad_norm": 1.6376516819000244, + "learning_rate": 5e-05, + "loss": 1.8949, + "step": 752000 + }, + { + "epoch": 0.7797056999282422, + "grad_norm": 1.3692593574523926, + "learning_rate": 5e-05, + "loss": 1.8791, + "step": 753000 + }, + { + "epoch": 0.7807411656651987, + "grad_norm": 1.4702037572860718, + "learning_rate": 5e-05, + "loss": 1.8893, + "step": 754000 + }, + { + "epoch": 0.7817766314021553, + "grad_norm": 1.5268641710281372, + "learning_rate": 5e-05, + "loss": 1.8845, + "step": 755000 + }, + { + "epoch": 0.7817766314021553, + "eval_loss": 1.8741161823272705, + "eval_runtime": 23.2405, + "eval_samples_per_second": 2128.054, + "eval_steps_per_second": 8.348, + "step": 755000 + }, + { + "epoch": 0.7828120971391117, + "grad_norm": 1.5007132291793823, + "learning_rate": 5e-05, + "loss": 1.8857, + "step": 756000 + }, + { + "epoch": 0.7838475628760682, + "grad_norm": 1.7741094827651978, + "learning_rate": 5e-05, + "loss": 1.9001, + "step": 757000 + }, + { + "epoch": 0.7848830286130247, + "grad_norm": 1.789960503578186, + "learning_rate": 5e-05, + "loss": 1.8887, + "step": 758000 + }, + { + "epoch": 0.7859184943499812, + "grad_norm": 1.7130295038223267, + "learning_rate": 5e-05, + "loss": 1.8781, + "step": 759000 + }, + { + "epoch": 0.7869539600869377, + "grad_norm": 1.8921586275100708, + "learning_rate": 5e-05, + "loss": 1.8832, + "step": 760000 + }, + { + "epoch": 0.7869539600869377, + "eval_loss": 1.8890156745910645, + "eval_runtime": 23.6747, + "eval_samples_per_second": 2089.023, + "eval_steps_per_second": 8.194, + "step": 760000 + }, + { + "epoch": 0.7879894258238942, + "grad_norm": 1.4963884353637695, + "learning_rate": 5e-05, + "loss": 1.8822, + "step": 761000 + }, + { + "epoch": 0.7890248915608506, + "grad_norm": 1.3713665008544922, + "learning_rate": 5e-05, + "loss": 1.8814, + "step": 762000 + }, + { + "epoch": 0.7900603572978072, + "grad_norm": 2.0718443393707275, + "learning_rate": 5e-05, + "loss": 1.8794, + "step": 763000 + }, + { + "epoch": 0.7910958230347637, + "grad_norm": 2.4328157901763916, + "learning_rate": 5e-05, + "loss": 1.8772, + "step": 764000 + }, + { + "epoch": 0.7921312887717202, + "grad_norm": 1.6719129085540771, + "learning_rate": 5e-05, + "loss": 1.8859, + "step": 765000 + }, + { + "epoch": 0.7921312887717202, + "eval_loss": 1.878068447113037, + "eval_runtime": 24.513, + "eval_samples_per_second": 2017.582, + "eval_steps_per_second": 7.914, + "step": 765000 + }, + { + "epoch": 0.7931667545086767, + "grad_norm": 1.5634863376617432, + "learning_rate": 5e-05, + "loss": 1.8863, + "step": 766000 + }, + { + "epoch": 0.7942022202456331, + "grad_norm": 2.2358057498931885, + "learning_rate": 5e-05, + "loss": 1.8882, + "step": 767000 + }, + { + "epoch": 0.7952376859825897, + "grad_norm": 1.6573256254196167, + "learning_rate": 5e-05, + "loss": 1.8934, + "step": 768000 + }, + { + "epoch": 0.7962731517195462, + "grad_norm": 1.738838791847229, + "learning_rate": 5e-05, + "loss": 1.8819, + "step": 769000 + }, + { + "epoch": 0.7973086174565027, + "grad_norm": 1.400238275527954, + "learning_rate": 5e-05, + "loss": 1.8845, + "step": 770000 + }, + { + "epoch": 0.7973086174565027, + "eval_loss": 1.871466040611267, + "eval_runtime": 26.2147, + "eval_samples_per_second": 1886.614, + "eval_steps_per_second": 7.4, + "step": 770000 + }, + { + "epoch": 0.7983440831934592, + "grad_norm": 1.506074070930481, + "learning_rate": 5e-05, + "loss": 1.8833, + "step": 771000 + }, + { + "epoch": 0.7993795489304156, + "grad_norm": 1.7725430727005005, + "learning_rate": 5e-05, + "loss": 1.8825, + "step": 772000 + }, + { + "epoch": 0.8004150146673722, + "grad_norm": 1.2773728370666504, + "learning_rate": 5e-05, + "loss": 1.8827, + "step": 773000 + }, + { + "epoch": 0.8014504804043286, + "grad_norm": 1.9510735273361206, + "learning_rate": 5e-05, + "loss": 1.8797, + "step": 774000 + }, + { + "epoch": 0.8024859461412852, + "grad_norm": 1.7016202211380005, + "learning_rate": 5e-05, + "loss": 1.8922, + "step": 775000 + }, + { + "epoch": 0.8024859461412852, + "eval_loss": 1.8798218965530396, + "eval_runtime": 27.438, + "eval_samples_per_second": 1802.497, + "eval_steps_per_second": 7.07, + "step": 775000 + }, + { + "epoch": 0.8035214118782417, + "grad_norm": 1.8934874534606934, + "learning_rate": 5e-05, + "loss": 1.8825, + "step": 776000 + }, + { + "epoch": 0.8045568776151981, + "grad_norm": 1.5149208307266235, + "learning_rate": 5e-05, + "loss": 1.8937, + "step": 777000 + }, + { + "epoch": 0.8055923433521547, + "grad_norm": 1.3739936351776123, + "learning_rate": 5e-05, + "loss": 1.8713, + "step": 778000 + }, + { + "epoch": 0.8066278090891111, + "grad_norm": 1.7757251262664795, + "learning_rate": 5e-05, + "loss": 1.8805, + "step": 779000 + }, + { + "epoch": 0.8076632748260676, + "grad_norm": 1.7812187671661377, + "learning_rate": 5e-05, + "loss": 1.8826, + "step": 780000 + }, + { + "epoch": 0.8076632748260676, + "eval_loss": 1.865774393081665, + "eval_runtime": 27.1199, + "eval_samples_per_second": 1823.645, + "eval_steps_per_second": 7.153, + "step": 780000 + }, + { + "epoch": 0.8086987405630242, + "grad_norm": 1.4904496669769287, + "learning_rate": 5e-05, + "loss": 1.8807, + "step": 781000 + }, + { + "epoch": 0.8097342062999806, + "grad_norm": 1.5127277374267578, + "learning_rate": 5e-05, + "loss": 1.8754, + "step": 782000 + }, + { + "epoch": 0.8107696720369372, + "grad_norm": 1.8934956789016724, + "learning_rate": 5e-05, + "loss": 1.8928, + "step": 783000 + }, + { + "epoch": 0.8118051377738936, + "grad_norm": 1.9401856660842896, + "learning_rate": 5e-05, + "loss": 1.8839, + "step": 784000 + }, + { + "epoch": 0.8128406035108501, + "grad_norm": 1.5946576595306396, + "learning_rate": 5e-05, + "loss": 1.8714, + "step": 785000 + }, + { + "epoch": 0.8128406035108501, + "eval_loss": 1.8689470291137695, + "eval_runtime": 27.4211, + "eval_samples_per_second": 1803.611, + "eval_steps_per_second": 7.075, + "step": 785000 + }, + { + "epoch": 0.8138760692478066, + "grad_norm": 1.7815802097320557, + "learning_rate": 5e-05, + "loss": 1.8843, + "step": 786000 + }, + { + "epoch": 0.8149115349847631, + "grad_norm": 1.684421420097351, + "learning_rate": 5e-05, + "loss": 1.8895, + "step": 787000 + }, + { + "epoch": 0.8159470007217197, + "grad_norm": 1.7290705442428589, + "learning_rate": 5e-05, + "loss": 1.8737, + "step": 788000 + }, + { + "epoch": 0.8169824664586761, + "grad_norm": 1.3579429388046265, + "learning_rate": 5e-05, + "loss": 1.8867, + "step": 789000 + }, + { + "epoch": 0.8180179321956326, + "grad_norm": 1.4426133632659912, + "learning_rate": 5e-05, + "loss": 1.8792, + "step": 790000 + }, + { + "epoch": 0.8180179321956326, + "eval_loss": 1.8706306219100952, + "eval_runtime": 25.2456, + "eval_samples_per_second": 1959.031, + "eval_steps_per_second": 7.684, + "step": 790000 + }, + { + "epoch": 0.8190533979325891, + "grad_norm": 1.7028056383132935, + "learning_rate": 5e-05, + "loss": 1.8876, + "step": 791000 + }, + { + "epoch": 0.8200888636695456, + "grad_norm": 1.6472172737121582, + "learning_rate": 5e-05, + "loss": 1.8853, + "step": 792000 + }, + { + "epoch": 0.8211243294065022, + "grad_norm": 1.6392238140106201, + "learning_rate": 5e-05, + "loss": 1.8763, + "step": 793000 + }, + { + "epoch": 0.8221597951434586, + "grad_norm": 1.2687627077102661, + "learning_rate": 5e-05, + "loss": 1.8754, + "step": 794000 + }, + { + "epoch": 0.8231952608804151, + "grad_norm": 1.4193922281265259, + "learning_rate": 5e-05, + "loss": 1.8778, + "step": 795000 + }, + { + "epoch": 0.8231952608804151, + "eval_loss": 1.8760771751403809, + "eval_runtime": 26.3623, + "eval_samples_per_second": 1876.047, + "eval_steps_per_second": 7.359, + "step": 795000 + }, + { + "epoch": 0.8242307266173716, + "grad_norm": 1.3184908628463745, + "learning_rate": 5e-05, + "loss": 1.8851, + "step": 796000 + }, + { + "epoch": 0.8252661923543281, + "grad_norm": 1.2626770734786987, + "learning_rate": 5e-05, + "loss": 1.8622, + "step": 797000 + }, + { + "epoch": 0.8263016580912846, + "grad_norm": 1.768161416053772, + "learning_rate": 5e-05, + "loss": 1.8822, + "step": 798000 + }, + { + "epoch": 0.8273371238282411, + "grad_norm": 2.2609405517578125, + "learning_rate": 5e-05, + "loss": 1.8776, + "step": 799000 + }, + { + "epoch": 0.8283725895651975, + "grad_norm": 1.4166158437728882, + "learning_rate": 5e-05, + "loss": 1.8834, + "step": 800000 + }, + { + "epoch": 0.8283725895651975, + "eval_loss": 1.874098777770996, + "eval_runtime": 27.908, + "eval_samples_per_second": 1772.144, + "eval_steps_per_second": 6.951, + "step": 800000 + }, + { + "epoch": 0.8294080553021541, + "grad_norm": 1.5788943767547607, + "learning_rate": 5e-05, + "loss": 1.8854, + "step": 801000 + }, + { + "epoch": 0.8304435210391106, + "grad_norm": 1.682399034500122, + "learning_rate": 5e-05, + "loss": 1.89, + "step": 802000 + }, + { + "epoch": 0.831478986776067, + "grad_norm": 1.3524305820465088, + "learning_rate": 5e-05, + "loss": 1.8893, + "step": 803000 + }, + { + "epoch": 0.8325144525130236, + "grad_norm": 2.113900661468506, + "learning_rate": 5e-05, + "loss": 1.8842, + "step": 804000 + }, + { + "epoch": 0.83354991824998, + "grad_norm": 1.3972655534744263, + "learning_rate": 5e-05, + "loss": 1.8868, + "step": 805000 + }, + { + "epoch": 0.83354991824998, + "eval_loss": 1.8680431842803955, + "eval_runtime": 32.3741, + "eval_samples_per_second": 1527.672, + "eval_steps_per_second": 5.992, + "step": 805000 + }, + { + "epoch": 0.8345853839869366, + "grad_norm": 1.905885100364685, + "learning_rate": 5e-05, + "loss": 1.8913, + "step": 806000 + }, + { + "epoch": 0.835620849723893, + "grad_norm": 1.618198037147522, + "learning_rate": 5e-05, + "loss": 1.8756, + "step": 807000 + }, + { + "epoch": 0.8366563154608495, + "grad_norm": 1.955761432647705, + "learning_rate": 5e-05, + "loss": 1.8883, + "step": 808000 + }, + { + "epoch": 0.8376917811978061, + "grad_norm": 1.9344592094421387, + "learning_rate": 5e-05, + "loss": 1.8843, + "step": 809000 + }, + { + "epoch": 0.8387272469347625, + "grad_norm": 1.4266360998153687, + "learning_rate": 5e-05, + "loss": 1.8728, + "step": 810000 + }, + { + "epoch": 0.8387272469347625, + "eval_loss": 1.8683350086212158, + "eval_runtime": 24.7677, + "eval_samples_per_second": 1996.836, + "eval_steps_per_second": 7.833, + "step": 810000 + }, + { + "epoch": 0.8397627126717191, + "grad_norm": 1.6188902854919434, + "learning_rate": 5e-05, + "loss": 1.8817, + "step": 811000 + }, + { + "epoch": 0.8407981784086755, + "grad_norm": 1.6710392236709595, + "learning_rate": 5e-05, + "loss": 1.8732, + "step": 812000 + }, + { + "epoch": 0.841833644145632, + "grad_norm": 1.4408323764801025, + "learning_rate": 5e-05, + "loss": 1.8763, + "step": 813000 + }, + { + "epoch": 0.8428691098825886, + "grad_norm": 1.6970560550689697, + "learning_rate": 5e-05, + "loss": 1.8796, + "step": 814000 + }, + { + "epoch": 0.843904575619545, + "grad_norm": 1.4276736974716187, + "learning_rate": 5e-05, + "loss": 1.8792, + "step": 815000 + }, + { + "epoch": 0.843904575619545, + "eval_loss": 1.8652664422988892, + "eval_runtime": 25.7373, + "eval_samples_per_second": 1921.61, + "eval_steps_per_second": 7.538, + "step": 815000 + }, + { + "epoch": 0.8449400413565016, + "grad_norm": 1.2162322998046875, + "learning_rate": 5e-05, + "loss": 1.8751, + "step": 816000 + }, + { + "epoch": 0.845975507093458, + "grad_norm": 1.6202760934829712, + "learning_rate": 5e-05, + "loss": 1.8746, + "step": 817000 + }, + { + "epoch": 0.8470109728304145, + "grad_norm": 1.5370842218399048, + "learning_rate": 5e-05, + "loss": 1.8679, + "step": 818000 + }, + { + "epoch": 0.848046438567371, + "grad_norm": 1.9520437717437744, + "learning_rate": 5e-05, + "loss": 1.8777, + "step": 819000 + }, + { + "epoch": 0.8490819043043275, + "grad_norm": 1.6644766330718994, + "learning_rate": 5e-05, + "loss": 1.875, + "step": 820000 + }, + { + "epoch": 0.8490819043043275, + "eval_loss": 1.8683598041534424, + "eval_runtime": 25.0415, + "eval_samples_per_second": 1975.005, + "eval_steps_per_second": 7.747, + "step": 820000 + }, + { + "epoch": 0.8501173700412841, + "grad_norm": 1.677675724029541, + "learning_rate": 5e-05, + "loss": 1.8768, + "step": 821000 + }, + { + "epoch": 0.8511528357782405, + "grad_norm": 1.6333279609680176, + "learning_rate": 5e-05, + "loss": 1.8826, + "step": 822000 + }, + { + "epoch": 0.852188301515197, + "grad_norm": 2.3194546699523926, + "learning_rate": 5e-05, + "loss": 1.8713, + "step": 823000 + }, + { + "epoch": 0.8532237672521535, + "grad_norm": 1.945440411567688, + "learning_rate": 5e-05, + "loss": 1.8675, + "step": 824000 + }, + { + "epoch": 0.85425923298911, + "grad_norm": 1.6451483964920044, + "learning_rate": 5e-05, + "loss": 1.8821, + "step": 825000 + }, + { + "epoch": 0.85425923298911, + "eval_loss": 1.8589342832565308, + "eval_runtime": 24.706, + "eval_samples_per_second": 2001.819, + "eval_steps_per_second": 7.852, + "step": 825000 + }, + { + "epoch": 0.8552946987260664, + "grad_norm": 1.882265329360962, + "learning_rate": 5e-05, + "loss": 1.8867, + "step": 826000 + }, + { + "epoch": 0.856330164463023, + "grad_norm": 1.4808956384658813, + "learning_rate": 5e-05, + "loss": 1.8727, + "step": 827000 + }, + { + "epoch": 0.8573656301999795, + "grad_norm": 1.383360505104065, + "learning_rate": 5e-05, + "loss": 1.873, + "step": 828000 + }, + { + "epoch": 0.858401095936936, + "grad_norm": 1.4626834392547607, + "learning_rate": 5e-05, + "loss": 1.8629, + "step": 829000 + }, + { + "epoch": 0.8594365616738925, + "grad_norm": 1.668229341506958, + "learning_rate": 5e-05, + "loss": 1.8877, + "step": 830000 + }, + { + "epoch": 0.8594365616738925, + "eval_loss": 1.8676975965499878, + "eval_runtime": 24.738, + "eval_samples_per_second": 1999.229, + "eval_steps_per_second": 7.842, + "step": 830000 + }, + { + "epoch": 0.8604720274108489, + "grad_norm": 1.9062445163726807, + "learning_rate": 5e-05, + "loss": 1.8692, + "step": 831000 + }, + { + "epoch": 0.8615074931478055, + "grad_norm": 1.6044979095458984, + "learning_rate": 5e-05, + "loss": 1.8696, + "step": 832000 + }, + { + "epoch": 0.862542958884762, + "grad_norm": 1.5497651100158691, + "learning_rate": 5e-05, + "loss": 1.8772, + "step": 833000 + }, + { + "epoch": 0.8635784246217185, + "grad_norm": 2.272531747817993, + "learning_rate": 5e-05, + "loss": 1.8846, + "step": 834000 + }, + { + "epoch": 0.864613890358675, + "grad_norm": 1.831063151359558, + "learning_rate": 5e-05, + "loss": 1.8684, + "step": 835000 + }, + { + "epoch": 0.864613890358675, + "eval_loss": 1.8688569068908691, + "eval_runtime": 24.7995, + "eval_samples_per_second": 1994.276, + "eval_steps_per_second": 7.823, + "step": 835000 + }, + { + "epoch": 0.8656493560956314, + "grad_norm": 1.9683046340942383, + "learning_rate": 5e-05, + "loss": 1.8768, + "step": 836000 + }, + { + "epoch": 0.866684821832588, + "grad_norm": 1.6770648956298828, + "learning_rate": 5e-05, + "loss": 1.8774, + "step": 837000 + }, + { + "epoch": 0.8677202875695444, + "grad_norm": 1.9348267316818237, + "learning_rate": 5e-05, + "loss": 1.8816, + "step": 838000 + }, + { + "epoch": 0.868755753306501, + "grad_norm": 1.5517618656158447, + "learning_rate": 5e-05, + "loss": 1.8761, + "step": 839000 + }, + { + "epoch": 0.8697912190434575, + "grad_norm": 1.8172694444656372, + "learning_rate": 5e-05, + "loss": 1.8711, + "step": 840000 + }, + { + "epoch": 0.8697912190434575, + "eval_loss": 1.8648767471313477, + "eval_runtime": 21.0335, + "eval_samples_per_second": 2351.339, + "eval_steps_per_second": 9.223, + "step": 840000 + }, + { + "epoch": 0.8708266847804139, + "grad_norm": 1.5013890266418457, + "learning_rate": 5e-05, + "loss": 1.8691, + "step": 841000 + }, + { + "epoch": 0.8718621505173705, + "grad_norm": 1.9419187307357788, + "learning_rate": 5e-05, + "loss": 1.8883, + "step": 842000 + }, + { + "epoch": 0.8728976162543269, + "grad_norm": 1.5654350519180298, + "learning_rate": 5e-05, + "loss": 1.8808, + "step": 843000 + }, + { + "epoch": 0.8739330819912835, + "grad_norm": 1.599421739578247, + "learning_rate": 5e-05, + "loss": 1.8724, + "step": 844000 + }, + { + "epoch": 0.87496854772824, + "grad_norm": 1.4926223754882812, + "learning_rate": 5e-05, + "loss": 1.8562, + "step": 845000 + }, + { + "epoch": 0.87496854772824, + "eval_loss": 1.8575286865234375, + "eval_runtime": 21.0794, + "eval_samples_per_second": 2346.223, + "eval_steps_per_second": 9.203, + "step": 845000 + }, + { + "epoch": 0.8760040134651964, + "grad_norm": 1.4079915285110474, + "learning_rate": 5e-05, + "loss": 1.866, + "step": 846000 + }, + { + "epoch": 0.877039479202153, + "grad_norm": 1.6430548429489136, + "learning_rate": 5e-05, + "loss": 1.8701, + "step": 847000 + }, + { + "epoch": 0.8780749449391094, + "grad_norm": 1.7442820072174072, + "learning_rate": 5e-05, + "loss": 1.8801, + "step": 848000 + }, + { + "epoch": 0.879110410676066, + "grad_norm": 1.5199944972991943, + "learning_rate": 5e-05, + "loss": 1.8759, + "step": 849000 + }, + { + "epoch": 0.8801458764130224, + "grad_norm": 1.440382719039917, + "learning_rate": 5e-05, + "loss": 1.8863, + "step": 850000 + }, + { + "epoch": 0.8801458764130224, + "eval_loss": 1.8506907224655151, + "eval_runtime": 22.4651, + "eval_samples_per_second": 2201.505, + "eval_steps_per_second": 8.636, + "step": 850000 + }, + { + "epoch": 0.8811813421499789, + "grad_norm": 1.3661401271820068, + "learning_rate": 5e-05, + "loss": 1.8817, + "step": 851000 + }, + { + "epoch": 0.8822168078869355, + "grad_norm": 1.334959626197815, + "learning_rate": 5e-05, + "loss": 1.8656, + "step": 852000 + }, + { + "epoch": 0.8832522736238919, + "grad_norm": 1.2820968627929688, + "learning_rate": 5e-05, + "loss": 1.8648, + "step": 853000 + }, + { + "epoch": 0.8842877393608484, + "grad_norm": 1.7478554248809814, + "learning_rate": 5e-05, + "loss": 1.8513, + "step": 854000 + }, + { + "epoch": 0.8853232050978049, + "grad_norm": 1.6486462354660034, + "learning_rate": 5e-05, + "loss": 1.8729, + "step": 855000 + }, + { + "epoch": 0.8853232050978049, + "eval_loss": 1.8535994291305542, + "eval_runtime": 21.8631, + "eval_samples_per_second": 2262.122, + "eval_steps_per_second": 8.873, + "step": 855000 + }, + { + "epoch": 0.8863586708347614, + "grad_norm": 1.7058559656143188, + "learning_rate": 5e-05, + "loss": 1.8646, + "step": 856000 + }, + { + "epoch": 0.887394136571718, + "grad_norm": 1.849015712738037, + "learning_rate": 5e-05, + "loss": 1.8641, + "step": 857000 + }, + { + "epoch": 0.8884296023086744, + "grad_norm": 1.493324637413025, + "learning_rate": 5e-05, + "loss": 1.8717, + "step": 858000 + }, + { + "epoch": 0.8894650680456309, + "grad_norm": 1.6665401458740234, + "learning_rate": 5e-05, + "loss": 1.8663, + "step": 859000 + }, + { + "epoch": 0.8905005337825874, + "grad_norm": 1.5283894538879395, + "learning_rate": 5e-05, + "loss": 1.8724, + "step": 860000 + }, + { + "epoch": 0.8905005337825874, + "eval_loss": 1.8644919395446777, + "eval_runtime": 23.2821, + "eval_samples_per_second": 2124.25, + "eval_steps_per_second": 8.333, + "step": 860000 + }, + { + "epoch": 0.8915359995195439, + "grad_norm": 1.7026426792144775, + "learning_rate": 5e-05, + "loss": 1.8713, + "step": 861000 + }, + { + "epoch": 0.8925714652565004, + "grad_norm": 1.3697137832641602, + "learning_rate": 5e-05, + "loss": 1.8691, + "step": 862000 + }, + { + "epoch": 0.8936069309934569, + "grad_norm": 1.3484594821929932, + "learning_rate": 5e-05, + "loss": 1.8807, + "step": 863000 + }, + { + "epoch": 0.8946423967304133, + "grad_norm": 1.725483775138855, + "learning_rate": 5e-05, + "loss": 1.8714, + "step": 864000 + }, + { + "epoch": 0.8956778624673699, + "grad_norm": 1.7059649229049683, + "learning_rate": 5e-05, + "loss": 1.8663, + "step": 865000 + }, + { + "epoch": 0.8956778624673699, + "eval_loss": 1.8639227151870728, + "eval_runtime": 22.1701, + "eval_samples_per_second": 2230.796, + "eval_steps_per_second": 8.751, + "step": 865000 + }, + { + "epoch": 0.8967133282043264, + "grad_norm": 1.9144644737243652, + "learning_rate": 5e-05, + "loss": 1.8609, + "step": 866000 + }, + { + "epoch": 0.8977487939412829, + "grad_norm": 2.074328899383545, + "learning_rate": 5e-05, + "loss": 1.8749, + "step": 867000 + }, + { + "epoch": 0.8987842596782394, + "grad_norm": 1.2168279886245728, + "learning_rate": 5e-05, + "loss": 1.8611, + "step": 868000 + }, + { + "epoch": 0.8998197254151958, + "grad_norm": 1.6179542541503906, + "learning_rate": 5e-05, + "loss": 1.8666, + "step": 869000 + }, + { + "epoch": 0.9008551911521524, + "grad_norm": 1.4882663488388062, + "learning_rate": 5e-05, + "loss": 1.861, + "step": 870000 + }, + { + "epoch": 0.9008551911521524, + "eval_loss": 1.8574090003967285, + "eval_runtime": 22.0353, + "eval_samples_per_second": 2244.441, + "eval_steps_per_second": 8.804, + "step": 870000 + }, + { + "epoch": 0.9018906568891089, + "grad_norm": 1.5473912954330444, + "learning_rate": 5e-05, + "loss": 1.8784, + "step": 871000 + }, + { + "epoch": 0.9029261226260654, + "grad_norm": 1.5128304958343506, + "learning_rate": 5e-05, + "loss": 1.867, + "step": 872000 + }, + { + "epoch": 0.9039615883630219, + "grad_norm": 1.6824181079864502, + "learning_rate": 5e-05, + "loss": 1.8605, + "step": 873000 + }, + { + "epoch": 0.9049970540999783, + "grad_norm": 1.4800105094909668, + "learning_rate": 5e-05, + "loss": 1.8539, + "step": 874000 + }, + { + "epoch": 0.9060325198369349, + "grad_norm": 1.7847933769226074, + "learning_rate": 5e-05, + "loss": 1.8742, + "step": 875000 + }, + { + "epoch": 0.9060325198369349, + "eval_loss": 1.8582428693771362, + "eval_runtime": 22.3155, + "eval_samples_per_second": 2216.265, + "eval_steps_per_second": 8.694, + "step": 875000 + }, + { + "epoch": 0.9070679855738913, + "grad_norm": 1.5011705160140991, + "learning_rate": 5e-05, + "loss": 1.8654, + "step": 876000 + }, + { + "epoch": 0.9081034513108478, + "grad_norm": 2.146315336227417, + "learning_rate": 5e-05, + "loss": 1.8602, + "step": 877000 + }, + { + "epoch": 0.9091389170478044, + "grad_norm": 1.6306260824203491, + "learning_rate": 5e-05, + "loss": 1.8645, + "step": 878000 + }, + { + "epoch": 0.9101743827847608, + "grad_norm": 2.065568685531616, + "learning_rate": 5e-05, + "loss": 1.8591, + "step": 879000 + }, + { + "epoch": 0.9112098485217174, + "grad_norm": 1.3474091291427612, + "learning_rate": 5e-05, + "loss": 1.8722, + "step": 880000 + }, + { + "epoch": 0.9112098485217174, + "eval_loss": 1.8544831275939941, + "eval_runtime": 23.1598, + "eval_samples_per_second": 2135.47, + "eval_steps_per_second": 8.377, + "step": 880000 + }, + { + "epoch": 0.9122453142586738, + "grad_norm": 1.6332334280014038, + "learning_rate": 5e-05, + "loss": 1.8727, + "step": 881000 + }, + { + "epoch": 0.9132807799956303, + "grad_norm": 1.5789443254470825, + "learning_rate": 5e-05, + "loss": 1.8613, + "step": 882000 + }, + { + "epoch": 0.9143162457325869, + "grad_norm": 1.859616994857788, + "learning_rate": 5e-05, + "loss": 1.8518, + "step": 883000 + }, + { + "epoch": 0.9153517114695433, + "grad_norm": 1.6380046606063843, + "learning_rate": 5e-05, + "loss": 1.8592, + "step": 884000 + }, + { + "epoch": 0.9163871772064999, + "grad_norm": 1.8654091358184814, + "learning_rate": 5e-05, + "loss": 1.853, + "step": 885000 + }, + { + "epoch": 0.9163871772064999, + "eval_loss": 1.856461763381958, + "eval_runtime": 22.3583, + "eval_samples_per_second": 2212.021, + "eval_steps_per_second": 8.677, + "step": 885000 + }, + { + "epoch": 0.9174226429434563, + "grad_norm": 1.3017899990081787, + "learning_rate": 5e-05, + "loss": 1.8688, + "step": 886000 + }, + { + "epoch": 0.9184581086804128, + "grad_norm": 2.0168769359588623, + "learning_rate": 5e-05, + "loss": 1.865, + "step": 887000 + }, + { + "epoch": 0.9194935744173693, + "grad_norm": 1.8421316146850586, + "learning_rate": 5e-05, + "loss": 1.8549, + "step": 888000 + }, + { + "epoch": 0.9205290401543258, + "grad_norm": 1.4435760974884033, + "learning_rate": 5e-05, + "loss": 1.8736, + "step": 889000 + }, + { + "epoch": 0.9215645058912824, + "grad_norm": 1.6617441177368164, + "learning_rate": 5e-05, + "loss": 1.8635, + "step": 890000 + }, + { + "epoch": 0.9215645058912824, + "eval_loss": 1.8535727262496948, + "eval_runtime": 21.8604, + "eval_samples_per_second": 2262.398, + "eval_steps_per_second": 8.874, + "step": 890000 + }, + { + "epoch": 0.9225999716282388, + "grad_norm": 1.338428258895874, + "learning_rate": 5e-05, + "loss": 1.8652, + "step": 891000 + }, + { + "epoch": 0.9236354373651953, + "grad_norm": 1.660236120223999, + "learning_rate": 5e-05, + "loss": 1.8601, + "step": 892000 + }, + { + "epoch": 0.9246709031021518, + "grad_norm": 1.5668145418167114, + "learning_rate": 5e-05, + "loss": 1.8639, + "step": 893000 + }, + { + "epoch": 0.9257063688391083, + "grad_norm": 1.8961373567581177, + "learning_rate": 5e-05, + "loss": 1.8649, + "step": 894000 + }, + { + "epoch": 0.9267418345760648, + "grad_norm": 1.5430703163146973, + "learning_rate": 5e-05, + "loss": 1.864, + "step": 895000 + }, + { + "epoch": 0.9267418345760648, + "eval_loss": 1.8523567914962769, + "eval_runtime": 20.6865, + "eval_samples_per_second": 2390.781, + "eval_steps_per_second": 9.378, + "step": 895000 + }, + { + "epoch": 0.9277773003130213, + "grad_norm": 1.4171661138534546, + "learning_rate": 5e-05, + "loss": 1.8722, + "step": 896000 + }, + { + "epoch": 0.9288127660499778, + "grad_norm": 1.6401444673538208, + "learning_rate": 5e-05, + "loss": 1.8591, + "step": 897000 + }, + { + "epoch": 0.9298482317869343, + "grad_norm": 1.6377249956130981, + "learning_rate": 5e-05, + "loss": 1.8618, + "step": 898000 + }, + { + "epoch": 0.9308836975238908, + "grad_norm": 1.9125347137451172, + "learning_rate": 5e-05, + "loss": 1.8666, + "step": 899000 + }, + { + "epoch": 0.9319191632608472, + "grad_norm": 1.4101911783218384, + "learning_rate": 5e-05, + "loss": 1.8697, + "step": 900000 + }, + { + "epoch": 0.9319191632608472, + "eval_loss": 1.8492544889450073, + "eval_runtime": 22.078, + "eval_samples_per_second": 2240.106, + "eval_steps_per_second": 8.787, + "step": 900000 + }, + { + "epoch": 0.9329546289978038, + "grad_norm": 1.3126411437988281, + "learning_rate": 5e-05, + "loss": 1.8554, + "step": 901000 + }, + { + "epoch": 0.9339900947347602, + "grad_norm": 1.494831919670105, + "learning_rate": 5e-05, + "loss": 1.8655, + "step": 902000 + }, + { + "epoch": 0.9350255604717168, + "grad_norm": 1.6403518915176392, + "learning_rate": 5e-05, + "loss": 1.8702, + "step": 903000 + }, + { + "epoch": 0.9360610262086733, + "grad_norm": 1.4031927585601807, + "learning_rate": 5e-05, + "loss": 1.8737, + "step": 904000 + }, + { + "epoch": 0.9370964919456297, + "grad_norm": 1.785649061203003, + "learning_rate": 5e-05, + "loss": 1.8598, + "step": 905000 + }, + { + "epoch": 0.9370964919456297, + "eval_loss": 1.8533316850662231, + "eval_runtime": 21.2318, + "eval_samples_per_second": 2329.385, + "eval_steps_per_second": 9.137, + "step": 905000 + }, + { + "epoch": 0.9381319576825863, + "grad_norm": 1.651497721672058, + "learning_rate": 5e-05, + "loss": 1.8555, + "step": 906000 + }, + { + "epoch": 0.9391674234195427, + "grad_norm": 1.7172114849090576, + "learning_rate": 5e-05, + "loss": 1.8621, + "step": 907000 + }, + { + "epoch": 0.9402028891564993, + "grad_norm": 1.606945276260376, + "learning_rate": 5e-05, + "loss": 1.8664, + "step": 908000 + }, + { + "epoch": 0.9412383548934558, + "grad_norm": 1.686049222946167, + "learning_rate": 5e-05, + "loss": 1.8696, + "step": 909000 + }, + { + "epoch": 0.9422738206304122, + "grad_norm": 1.7451001405715942, + "learning_rate": 5e-05, + "loss": 1.8609, + "step": 910000 + }, + { + "epoch": 0.9422738206304122, + "eval_loss": 1.8576778173446655, + "eval_runtime": 33.0236, + "eval_samples_per_second": 1497.626, + "eval_steps_per_second": 5.875, + "step": 910000 + }, + { + "epoch": 0.9433092863673688, + "grad_norm": 1.8006882667541504, + "learning_rate": 5e-05, + "loss": 1.857, + "step": 911000 + }, + { + "epoch": 0.9443447521043252, + "grad_norm": 1.6737642288208008, + "learning_rate": 5e-05, + "loss": 1.8548, + "step": 912000 + }, + { + "epoch": 0.9453802178412818, + "grad_norm": 1.5626418590545654, + "learning_rate": 5e-05, + "loss": 1.8477, + "step": 913000 + }, + { + "epoch": 0.9464156835782382, + "grad_norm": 1.6796774864196777, + "learning_rate": 5e-05, + "loss": 1.8686, + "step": 914000 + }, + { + "epoch": 0.9474511493151947, + "grad_norm": 1.7748136520385742, + "learning_rate": 5e-05, + "loss": 1.8657, + "step": 915000 + }, + { + "epoch": 0.9474511493151947, + "eval_loss": 1.8437752723693848, + "eval_runtime": 55.0601, + "eval_samples_per_second": 898.237, + "eval_steps_per_second": 3.523, + "step": 915000 + }, + { + "epoch": 0.9484866150521513, + "grad_norm": 1.745834469795227, + "learning_rate": 5e-05, + "loss": 1.8585, + "step": 916000 + }, + { + "epoch": 0.9495220807891077, + "grad_norm": 1.5798673629760742, + "learning_rate": 5e-05, + "loss": 1.8673, + "step": 917000 + }, + { + "epoch": 0.9505575465260643, + "grad_norm": 1.6271969079971313, + "learning_rate": 5e-05, + "loss": 1.8554, + "step": 918000 + }, + { + "epoch": 0.9515930122630207, + "grad_norm": 1.3999176025390625, + "learning_rate": 5e-05, + "loss": 1.8628, + "step": 919000 + }, + { + "epoch": 0.9526284779999772, + "grad_norm": 1.6120986938476562, + "learning_rate": 5e-05, + "loss": 1.8623, + "step": 920000 + }, + { + "epoch": 0.9526284779999772, + "eval_loss": 1.8485383987426758, + "eval_runtime": 19.9677, + "eval_samples_per_second": 2476.855, + "eval_steps_per_second": 9.716, + "step": 920000 + }, + { + "epoch": 0.9536639437369338, + "grad_norm": 1.9189943075180054, + "learning_rate": 5e-05, + "loss": 1.8525, + "step": 921000 + }, + { + "epoch": 0.9546994094738902, + "grad_norm": 1.4193308353424072, + "learning_rate": 5e-05, + "loss": 1.8596, + "step": 922000 + }, + { + "epoch": 0.9557348752108467, + "grad_norm": 1.5809720754623413, + "learning_rate": 5e-05, + "loss": 1.8618, + "step": 923000 + }, + { + "epoch": 0.9567703409478032, + "grad_norm": 1.699925184249878, + "learning_rate": 5e-05, + "loss": 1.8652, + "step": 924000 + }, + { + "epoch": 0.9578058066847597, + "grad_norm": 1.6282320022583008, + "learning_rate": 5e-05, + "loss": 1.8619, + "step": 925000 + }, + { + "epoch": 0.9578058066847597, + "eval_loss": 1.8499970436096191, + "eval_runtime": 22.1279, + "eval_samples_per_second": 2235.052, + "eval_steps_per_second": 8.767, + "step": 925000 + }, + { + "epoch": 0.9588412724217162, + "grad_norm": 1.5954972505569458, + "learning_rate": 5e-05, + "loss": 1.8654, + "step": 926000 + }, + { + "epoch": 0.9598767381586727, + "grad_norm": 1.6539947986602783, + "learning_rate": 5e-05, + "loss": 1.8646, + "step": 927000 + }, + { + "epoch": 0.9609122038956291, + "grad_norm": 1.4428025484085083, + "learning_rate": 5e-05, + "loss": 1.872, + "step": 928000 + }, + { + "epoch": 0.9619476696325857, + "grad_norm": 1.92341148853302, + "learning_rate": 5e-05, + "loss": 1.8572, + "step": 929000 + }, + { + "epoch": 0.9629831353695422, + "grad_norm": 1.6431875228881836, + "learning_rate": 5e-05, + "loss": 1.8657, + "step": 930000 + }, + { + "epoch": 0.9629831353695422, + "eval_loss": 1.848019003868103, + "eval_runtime": 20.3315, + "eval_samples_per_second": 2432.533, + "eval_steps_per_second": 9.542, + "step": 930000 + }, + { + "epoch": 0.9640186011064987, + "grad_norm": 1.5909419059753418, + "learning_rate": 5e-05, + "loss": 1.847, + "step": 931000 + }, + { + "epoch": 0.9650540668434552, + "grad_norm": 1.510219693183899, + "learning_rate": 5e-05, + "loss": 1.8584, + "step": 932000 + }, + { + "epoch": 0.9660895325804116, + "grad_norm": 1.9107452630996704, + "learning_rate": 5e-05, + "loss": 1.8653, + "step": 933000 + }, + { + "epoch": 0.9671249983173682, + "grad_norm": 1.6867653131484985, + "learning_rate": 5e-05, + "loss": 1.8519, + "step": 934000 + }, + { + "epoch": 0.9681604640543247, + "grad_norm": 1.296783685684204, + "learning_rate": 5e-05, + "loss": 1.8547, + "step": 935000 + }, + { + "epoch": 0.9681604640543247, + "eval_loss": 1.8461804389953613, + "eval_runtime": 21.7793, + "eval_samples_per_second": 2270.826, + "eval_steps_per_second": 8.908, + "step": 935000 + }, + { + "epoch": 0.9691959297912812, + "grad_norm": 1.8731415271759033, + "learning_rate": 5e-05, + "loss": 1.8565, + "step": 936000 + }, + { + "epoch": 0.9702313955282377, + "grad_norm": 1.3044154644012451, + "learning_rate": 5e-05, + "loss": 1.8545, + "step": 937000 + }, + { + "epoch": 0.9712668612651941, + "grad_norm": 1.4259470701217651, + "learning_rate": 5e-05, + "loss": 1.8572, + "step": 938000 + }, + { + "epoch": 0.9723023270021507, + "grad_norm": 1.9832919836044312, + "learning_rate": 5e-05, + "loss": 1.8523, + "step": 939000 + }, + { + "epoch": 0.9733377927391071, + "grad_norm": 1.3519784212112427, + "learning_rate": 5e-05, + "loss": 1.8587, + "step": 940000 + }, + { + "epoch": 0.9733377927391071, + "eval_loss": 1.8498455286026, + "eval_runtime": 20.9376, + "eval_samples_per_second": 2362.116, + "eval_steps_per_second": 9.266, + "step": 940000 + }, + { + "epoch": 0.9743732584760637, + "grad_norm": 1.4580146074295044, + "learning_rate": 5e-05, + "loss": 1.8483, + "step": 941000 + }, + { + "epoch": 0.9754087242130202, + "grad_norm": 1.5738118886947632, + "learning_rate": 5e-05, + "loss": 1.8556, + "step": 942000 + }, + { + "epoch": 0.9764441899499766, + "grad_norm": 1.5515236854553223, + "learning_rate": 5e-05, + "loss": 1.8508, + "step": 943000 + }, + { + "epoch": 0.9774796556869332, + "grad_norm": 2.112576723098755, + "learning_rate": 5e-05, + "loss": 1.8533, + "step": 944000 + }, + { + "epoch": 0.9785151214238896, + "grad_norm": 1.3775643110275269, + "learning_rate": 5e-05, + "loss": 1.8551, + "step": 945000 + }, + { + "epoch": 0.9785151214238896, + "eval_loss": 1.84758460521698, + "eval_runtime": 22.0596, + "eval_samples_per_second": 2241.967, + "eval_steps_per_second": 8.794, + "step": 945000 + }, + { + "epoch": 0.9795505871608461, + "grad_norm": 1.6783188581466675, + "learning_rate": 5e-05, + "loss": 1.8588, + "step": 946000 + }, + { + "epoch": 0.9805860528978027, + "grad_norm": 1.7327836751937866, + "learning_rate": 5e-05, + "loss": 1.85, + "step": 947000 + }, + { + "epoch": 0.9816215186347591, + "grad_norm": 1.8131853342056274, + "learning_rate": 5e-05, + "loss": 1.8477, + "step": 948000 + }, + { + "epoch": 0.9826569843717157, + "grad_norm": 1.3955539464950562, + "learning_rate": 5e-05, + "loss": 1.848, + "step": 949000 + }, + { + "epoch": 0.9836924501086721, + "grad_norm": 1.5675952434539795, + "learning_rate": 5e-05, + "loss": 1.8488, + "step": 950000 + }, + { + "epoch": 0.9836924501086721, + "eval_loss": 1.8464607000350952, + "eval_runtime": 22.4257, + "eval_samples_per_second": 2205.372, + "eval_steps_per_second": 8.651, + "step": 950000 + }, + { + "epoch": 0.9847279158456286, + "grad_norm": 1.5982122421264648, + "learning_rate": 5e-05, + "loss": 1.8573, + "step": 951000 + }, + { + "epoch": 0.9857633815825851, + "grad_norm": 1.4788215160369873, + "learning_rate": 5e-05, + "loss": 1.8544, + "step": 952000 + }, + { + "epoch": 0.9867988473195416, + "grad_norm": 1.665727972984314, + "learning_rate": 5e-05, + "loss": 1.8531, + "step": 953000 + }, + { + "epoch": 0.9878343130564982, + "grad_norm": 1.7656697034835815, + "learning_rate": 5e-05, + "loss": 1.8471, + "step": 954000 + }, + { + "epoch": 0.9888697787934546, + "grad_norm": 1.8535689115524292, + "learning_rate": 5e-05, + "loss": 1.8412, + "step": 955000 + }, + { + "epoch": 0.9888697787934546, + "eval_loss": 1.8463835716247559, + "eval_runtime": 21.8162, + "eval_samples_per_second": 2266.983, + "eval_steps_per_second": 8.892, + "step": 955000 + }, + { + "epoch": 0.9899052445304111, + "grad_norm": 2.037118673324585, + "learning_rate": 5e-05, + "loss": 1.8575, + "step": 956000 + }, + { + "epoch": 0.9909407102673676, + "grad_norm": 1.5285453796386719, + "learning_rate": 5e-05, + "loss": 1.8624, + "step": 957000 + }, + { + "epoch": 0.9919761760043241, + "grad_norm": 1.7024654150009155, + "learning_rate": 5e-05, + "loss": 1.8641, + "step": 958000 + }, + { + "epoch": 0.9930116417412806, + "grad_norm": 1.6414403915405273, + "learning_rate": 5e-05, + "loss": 1.8433, + "step": 959000 + }, + { + "epoch": 0.9940471074782371, + "grad_norm": 1.518936276435852, + "learning_rate": 5e-05, + "loss": 1.857, + "step": 960000 + }, + { + "epoch": 0.9940471074782371, + "eval_loss": 1.8379032611846924, + "eval_runtime": 21.6695, + "eval_samples_per_second": 2282.33, + "eval_steps_per_second": 8.953, + "step": 960000 + }, + { + "epoch": 0.9950825732151936, + "grad_norm": 1.7347865104675293, + "learning_rate": 5e-05, + "loss": 1.8598, + "step": 961000 + }, + { + "epoch": 0.9961180389521501, + "grad_norm": 1.986828327178955, + "learning_rate": 5e-05, + "loss": 1.8432, + "step": 962000 + }, + { + "epoch": 0.9971535046891066, + "grad_norm": 1.5118800401687622, + "learning_rate": 5e-05, + "loss": 1.8532, + "step": 963000 + }, + { + "epoch": 0.9981889704260631, + "grad_norm": 1.719777226448059, + "learning_rate": 5e-05, + "loss": 1.8487, + "step": 964000 + }, + { + "epoch": 0.9992244361630196, + "grad_norm": 1.7723966836929321, + "learning_rate": 5e-05, + "loss": 1.8564, + "step": 965000 + }, + { + "epoch": 0.9992244361630196, + "eval_loss": 1.8406885862350464, + "eval_runtime": 21.4295, + "eval_samples_per_second": 2307.889, + "eval_steps_per_second": 9.053, + "step": 965000 + } + ], + "logging_steps": 1000, + "max_steps": 965749, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2642475996312345e+20, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}