diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15433 @@
+{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6490066225165565, + "eval_steps": 500, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012040939193257074, + "grad_norm": 9.316359519958496, + "learning_rate": 3.0102347983142685e-09, + "loss": 0.9861, + "step": 10 + }, + { + "epoch": 0.002408187838651415, + "grad_norm": 9.306455612182617, + "learning_rate": 6.020469596628537e-09, + "loss": 0.9917, + "step": 20 + }, + { + "epoch": 0.003612281757977122, + "grad_norm": 10.237154006958008, + "learning_rate": 9.030704394942806e-09, + "loss": 0.9875, + "step": 30 + }, + { + "epoch": 0.00481637567730283, + "grad_norm": 10.9087553024292, + "learning_rate": 1.2040939193257074e-08, + "loss": 1.0501, + "step": 40 + }, + { + "epoch": 0.006020469596628537, + "grad_norm": 8.430444717407227, + "learning_rate": 1.5051173991571343e-08, + "loss": 0.9665, + "step": 50 + }, + { + "epoch": 0.007224563515954244, + "grad_norm": 8.640472412109375, + "learning_rate": 1.8061408789885613e-08, + "loss": 1.0249, + "step": 60 + }, + { + "epoch": 0.008428657435279952, + "grad_norm": 8.656485557556152, + "learning_rate": 2.107164358819988e-08, + "loss": 0.9942, + "step": 70 + }, + { + "epoch": 0.00963275135460566, + "grad_norm": 11.827771186828613, + "learning_rate": 2.4081878386514148e-08, + "loss": 1.0359, + "step": 80 + }, + { + "epoch": 0.010836845273931367, + "grad_norm": 7.204784870147705, + "learning_rate": 2.7092113184828417e-08, + "loss": 0.9532, + "step": 90 + }, + { + "epoch": 0.012040939193257074, + "grad_norm": 7.546351432800293, + "learning_rate": 3.010234798314269e-08, + "loss": 1.0236, + "step": 100 + }, + { + "epoch": 0.013245033112582781, + "grad_norm": 10.156384468078613, + "learning_rate": 3.311258278145695e-08, + "loss": 1.0031, + "step": 110 + }, + { + "epoch": 0.014449127031908489, + "grad_norm": 9.753625869750977, + "learning_rate": 3.6122817579771225e-08, + "loss": 1.016, + "step": 120 + }, + { + "epoch": 0.015653220951234198, + "grad_norm": 8.019911766052246, + "learning_rate": 3.9133052378085485e-08, + "loss": 0.9972, + "step": 130 + }, + { + "epoch": 0.016857314870559904, + "grad_norm": 8.1049165725708, + "learning_rate": 4.214328717639976e-08, + "loss": 0.984, + "step": 140 + }, + { + "epoch": 0.018061408789885613, + "grad_norm": 8.9203462600708, + "learning_rate": 4.5153521974714023e-08, + "loss": 0.9811, + "step": 150 + }, + { + "epoch": 0.01926550270921132, + "grad_norm": 8.825779914855957, + "learning_rate": 4.8163756773028296e-08, + "loss": 1.0207, + "step": 160 + }, + { + "epoch": 0.020469596628537028, + "grad_norm": 8.64220142364502, + "learning_rate": 5.117399157134256e-08, + "loss": 0.9971, + "step": 170 + }, + { + "epoch": 0.021673690547862733, + "grad_norm": 8.120660781860352, + "learning_rate": 5.4184226369656835e-08, + "loss": 0.9606, + "step": 180 + }, + { + "epoch": 0.022877784467188442, + "grad_norm": 8.181641578674316, + "learning_rate": 5.71944611679711e-08, + "loss": 0.9516, + "step": 190 + }, + { + "epoch": 0.024081878386514148, + "grad_norm": 7.607439994812012, + "learning_rate": 6.020469596628537e-08, + "loss": 0.9919, + "step": 200 + }, + { + "epoch": 0.025285972305839857, + "grad_norm": 7.434635162353516, + "learning_rate": 6.321493076459963e-08, + "loss": 0.9991, + "step": 210 + }, + { + "epoch": 
0.026490066225165563, + "grad_norm": 7.569486141204834, + "learning_rate": 6.62251655629139e-08, + "loss": 0.984, + "step": 220 + }, + { + "epoch": 0.027694160144491272, + "grad_norm": 7.499971389770508, + "learning_rate": 6.923540036122818e-08, + "loss": 0.9598, + "step": 230 + }, + { + "epoch": 0.028898254063816978, + "grad_norm": 6.992701053619385, + "learning_rate": 7.224563515954245e-08, + "loss": 0.905, + "step": 240 + }, + { + "epoch": 0.030102347983142687, + "grad_norm": 6.3157877922058105, + "learning_rate": 7.525586995785671e-08, + "loss": 0.9493, + "step": 250 + }, + { + "epoch": 0.031306441902468396, + "grad_norm": 6.263482570648193, + "learning_rate": 7.826610475617097e-08, + "loss": 0.9501, + "step": 260 + }, + { + "epoch": 0.0325105358217941, + "grad_norm": 6.178393840789795, + "learning_rate": 8.127633955448524e-08, + "loss": 0.9056, + "step": 270 + }, + { + "epoch": 0.03371462974111981, + "grad_norm": 4.896974086761475, + "learning_rate": 8.428657435279951e-08, + "loss": 0.8679, + "step": 280 + }, + { + "epoch": 0.034918723660445516, + "grad_norm": 5.896145820617676, + "learning_rate": 8.729680915111379e-08, + "loss": 0.8658, + "step": 290 + }, + { + "epoch": 0.036122817579771226, + "grad_norm": 5.6855573654174805, + "learning_rate": 9.030704394942805e-08, + "loss": 0.9227, + "step": 300 + }, + { + "epoch": 0.03732691149909693, + "grad_norm": 4.907613277435303, + "learning_rate": 9.331727874774232e-08, + "loss": 0.8581, + "step": 310 + }, + { + "epoch": 0.03853100541842264, + "grad_norm": 6.029637336730957, + "learning_rate": 9.632751354605659e-08, + "loss": 0.8184, + "step": 320 + }, + { + "epoch": 0.039735099337748346, + "grad_norm": 5.0958333015441895, + "learning_rate": 9.933774834437085e-08, + "loss": 0.8524, + "step": 330 + }, + { + "epoch": 0.040939193257074055, + "grad_norm": 6.18320369720459, + "learning_rate": 1.0234798314268512e-07, + "loss": 0.8371, + "step": 340 + }, + { + "epoch": 0.04214328717639976, + "grad_norm": 4.874738693237305, + "learning_rate": 1.0535821794099938e-07, + "loss": 0.8348, + "step": 350 + }, + { + "epoch": 0.04334738109572547, + "grad_norm": 5.273070812225342, + "learning_rate": 1.0836845273931367e-07, + "loss": 0.8286, + "step": 360 + }, + { + "epoch": 0.044551475015051176, + "grad_norm": 5.052524089813232, + "learning_rate": 1.1137868753762793e-07, + "loss": 0.7585, + "step": 370 + }, + { + "epoch": 0.045755568934376885, + "grad_norm": 4.216408729553223, + "learning_rate": 1.143889223359422e-07, + "loss": 0.7869, + "step": 380 + }, + { + "epoch": 0.04695966285370259, + "grad_norm": 5.456339359283447, + "learning_rate": 1.1739915713425646e-07, + "loss": 0.7699, + "step": 390 + }, + { + "epoch": 0.048163756773028296, + "grad_norm": 4.809760093688965, + "learning_rate": 1.2040939193257075e-07, + "loss": 0.763, + "step": 400 + }, + { + "epoch": 0.049367850692354005, + "grad_norm": 4.933152675628662, + "learning_rate": 1.23419626730885e-07, + "loss": 0.7192, + "step": 410 + }, + { + "epoch": 0.050571944611679714, + "grad_norm": 5.025005340576172, + "learning_rate": 1.2642986152919927e-07, + "loss": 0.7092, + "step": 420 + }, + { + "epoch": 0.05177603853100542, + "grad_norm": 4.338512420654297, + "learning_rate": 1.2944009632751355e-07, + "loss": 0.7346, + "step": 430 + }, + { + "epoch": 0.052980132450331126, + "grad_norm": 4.557036399841309, + "learning_rate": 1.324503311258278e-07, + "loss": 0.723, + "step": 440 + }, + { + "epoch": 0.054184226369656835, + "grad_norm": 4.911799907684326, + "learning_rate": 1.3546056592414207e-07, + "loss": 
0.7673, + "step": 450 + }, + { + "epoch": 0.055388320288982544, + "grad_norm": 4.063588619232178, + "learning_rate": 1.3847080072245636e-07, + "loss": 0.708, + "step": 460 + }, + { + "epoch": 0.056592414208308246, + "grad_norm": 4.037914752960205, + "learning_rate": 1.4148103552077062e-07, + "loss": 0.7641, + "step": 470 + }, + { + "epoch": 0.057796508127633955, + "grad_norm": 4.673463344573975, + "learning_rate": 1.444912703190849e-07, + "loss": 0.7196, + "step": 480 + }, + { + "epoch": 0.059000602046959665, + "grad_norm": 5.096141815185547, + "learning_rate": 1.4750150511739913e-07, + "loss": 0.689, + "step": 490 + }, + { + "epoch": 0.060204695966285374, + "grad_norm": 4.904088973999023, + "learning_rate": 1.5051173991571342e-07, + "loss": 0.7211, + "step": 500 + }, + { + "epoch": 0.061408789885611076, + "grad_norm": 5.234721660614014, + "learning_rate": 1.535219747140277e-07, + "loss": 0.7091, + "step": 510 + }, + { + "epoch": 0.06261288380493679, + "grad_norm": 4.105505466461182, + "learning_rate": 1.5653220951234194e-07, + "loss": 0.7607, + "step": 520 + }, + { + "epoch": 0.0638169777242625, + "grad_norm": 4.666725158691406, + "learning_rate": 1.5954244431065622e-07, + "loss": 0.7158, + "step": 530 + }, + { + "epoch": 0.0650210716435882, + "grad_norm": 4.976656436920166, + "learning_rate": 1.6255267910897048e-07, + "loss": 0.7401, + "step": 540 + }, + { + "epoch": 0.06622516556291391, + "grad_norm": 5.044974327087402, + "learning_rate": 1.6556291390728477e-07, + "loss": 0.6685, + "step": 550 + }, + { + "epoch": 0.06742925948223961, + "grad_norm": 4.7259368896484375, + "learning_rate": 1.6857314870559903e-07, + "loss": 0.6866, + "step": 560 + }, + { + "epoch": 0.06863335340156532, + "grad_norm": 5.358945369720459, + "learning_rate": 1.715833835039133e-07, + "loss": 0.7059, + "step": 570 + }, + { + "epoch": 0.06983744732089103, + "grad_norm": 5.156592845916748, + "learning_rate": 1.7459361830222757e-07, + "loss": 0.7111, + "step": 580 + }, + { + "epoch": 0.07104154124021674, + "grad_norm": 4.320924282073975, + "learning_rate": 1.7760385310054183e-07, + "loss": 0.6862, + "step": 590 + }, + { + "epoch": 0.07224563515954245, + "grad_norm": 4.59999418258667, + "learning_rate": 1.806140878988561e-07, + "loss": 0.6893, + "step": 600 + }, + { + "epoch": 0.07344972907886815, + "grad_norm": 4.4846391677856445, + "learning_rate": 1.8362432269717038e-07, + "loss": 0.7128, + "step": 610 + }, + { + "epoch": 0.07465382299819386, + "grad_norm": 5.029007911682129, + "learning_rate": 1.8663455749548464e-07, + "loss": 0.7114, + "step": 620 + }, + { + "epoch": 0.07585791691751957, + "grad_norm": 4.288726806640625, + "learning_rate": 1.896447922937989e-07, + "loss": 0.6848, + "step": 630 + }, + { + "epoch": 0.07706201083684527, + "grad_norm": 4.063099384307861, + "learning_rate": 1.9265502709211318e-07, + "loss": 0.659, + "step": 640 + }, + { + "epoch": 0.07826610475617098, + "grad_norm": 4.031120300292969, + "learning_rate": 1.9566526189042744e-07, + "loss": 0.6802, + "step": 650 + }, + { + "epoch": 0.07947019867549669, + "grad_norm": 5.234511852264404, + "learning_rate": 1.986754966887417e-07, + "loss": 0.6685, + "step": 660 + }, + { + "epoch": 0.0806742925948224, + "grad_norm": 5.434250831604004, + "learning_rate": 2.01685731487056e-07, + "loss": 0.6762, + "step": 670 + }, + { + "epoch": 0.08187838651414811, + "grad_norm": 5.326634407043457, + "learning_rate": 2.0469596628537025e-07, + "loss": 0.6951, + "step": 680 + }, + { + "epoch": 0.08308248043347381, + "grad_norm": 3.630930185317993, + 
"learning_rate": 2.0770620108368453e-07, + "loss": 0.6445, + "step": 690 + }, + { + "epoch": 0.08428657435279951, + "grad_norm": 5.273288726806641, + "learning_rate": 2.1071643588199877e-07, + "loss": 0.65, + "step": 700 + }, + { + "epoch": 0.08549066827212523, + "grad_norm": 4.212562084197998, + "learning_rate": 2.1372667068031305e-07, + "loss": 0.6485, + "step": 710 + }, + { + "epoch": 0.08669476219145093, + "grad_norm": 4.293779373168945, + "learning_rate": 2.1673690547862734e-07, + "loss": 0.6734, + "step": 720 + }, + { + "epoch": 0.08789885611077664, + "grad_norm": 4.917520999908447, + "learning_rate": 2.1974714027694157e-07, + "loss": 0.6593, + "step": 730 + }, + { + "epoch": 0.08910295003010235, + "grad_norm": 4.624716281890869, + "learning_rate": 2.2275737507525586e-07, + "loss": 0.6712, + "step": 740 + }, + { + "epoch": 0.09030704394942805, + "grad_norm": 5.3648552894592285, + "learning_rate": 2.2576760987357014e-07, + "loss": 0.6481, + "step": 750 + }, + { + "epoch": 0.09151113786875377, + "grad_norm": 4.328650951385498, + "learning_rate": 2.287778446718844e-07, + "loss": 0.6418, + "step": 760 + }, + { + "epoch": 0.09271523178807947, + "grad_norm": 4.933085918426514, + "learning_rate": 2.3178807947019866e-07, + "loss": 0.6622, + "step": 770 + }, + { + "epoch": 0.09391932570740517, + "grad_norm": 4.703038215637207, + "learning_rate": 2.3479831426851292e-07, + "loss": 0.6317, + "step": 780 + }, + { + "epoch": 0.09512341962673089, + "grad_norm": 4.468968391418457, + "learning_rate": 2.378085490668272e-07, + "loss": 0.6431, + "step": 790 + }, + { + "epoch": 0.09632751354605659, + "grad_norm": 4.49053430557251, + "learning_rate": 2.408187838651415e-07, + "loss": 0.6212, + "step": 800 + }, + { + "epoch": 0.0975316074653823, + "grad_norm": 4.350528240203857, + "learning_rate": 2.438290186634557e-07, + "loss": 0.623, + "step": 810 + }, + { + "epoch": 0.09873570138470801, + "grad_norm": 4.772309303283691, + "learning_rate": 2.4683925346177e-07, + "loss": 0.6268, + "step": 820 + }, + { + "epoch": 0.09993979530403371, + "grad_norm": 5.437624931335449, + "learning_rate": 2.498494882600843e-07, + "loss": 0.6497, + "step": 830 + }, + { + "epoch": 0.10114388922335943, + "grad_norm": 4.474155902862549, + "learning_rate": 2.5285972305839853e-07, + "loss": 0.6092, + "step": 840 + }, + { + "epoch": 0.10234798314268513, + "grad_norm": 4.417370796203613, + "learning_rate": 2.558699578567128e-07, + "loss": 0.6587, + "step": 850 + }, + { + "epoch": 0.10355207706201083, + "grad_norm": 5.5498456954956055, + "learning_rate": 2.588801926550271e-07, + "loss": 0.6564, + "step": 860 + }, + { + "epoch": 0.10475617098133655, + "grad_norm": 5.326292514801025, + "learning_rate": 2.6189042745334134e-07, + "loss": 0.6333, + "step": 870 + }, + { + "epoch": 0.10596026490066225, + "grad_norm": 4.284069538116455, + "learning_rate": 2.649006622516556e-07, + "loss": 0.6325, + "step": 880 + }, + { + "epoch": 0.10716435881998795, + "grad_norm": 4.672844886779785, + "learning_rate": 2.679108970499699e-07, + "loss": 0.6046, + "step": 890 + }, + { + "epoch": 0.10836845273931367, + "grad_norm": 4.223860740661621, + "learning_rate": 2.7092113184828414e-07, + "loss": 0.6094, + "step": 900 + }, + { + "epoch": 0.10957254665863937, + "grad_norm": 4.813838005065918, + "learning_rate": 2.739313666465984e-07, + "loss": 0.6332, + "step": 910 + }, + { + "epoch": 0.11077664057796509, + "grad_norm": 3.5245296955108643, + "learning_rate": 2.769416014449127e-07, + "loss": 0.6161, + "step": 920 + }, + { + "epoch": 0.11198073449729079, + 
"grad_norm": 4.577372074127197, + "learning_rate": 2.7995183624322695e-07, + "loss": 0.6254, + "step": 930 + }, + { + "epoch": 0.11318482841661649, + "grad_norm": 4.295224666595459, + "learning_rate": 2.8296207104154123e-07, + "loss": 0.6089, + "step": 940 + }, + { + "epoch": 0.11438892233594221, + "grad_norm": 4.899755477905273, + "learning_rate": 2.8597230583985546e-07, + "loss": 0.6255, + "step": 950 + }, + { + "epoch": 0.11559301625526791, + "grad_norm": 5.047530651092529, + "learning_rate": 2.889825406381698e-07, + "loss": 0.6273, + "step": 960 + }, + { + "epoch": 0.11679711017459361, + "grad_norm": 4.75305700302124, + "learning_rate": 2.9199277543648404e-07, + "loss": 0.6277, + "step": 970 + }, + { + "epoch": 0.11800120409391933, + "grad_norm": 5.476251602172852, + "learning_rate": 2.9500301023479827e-07, + "loss": 0.6121, + "step": 980 + }, + { + "epoch": 0.11920529801324503, + "grad_norm": 6.20451021194458, + "learning_rate": 2.980132450331126e-07, + "loss": 0.6121, + "step": 990 + }, + { + "epoch": 0.12040939193257075, + "grad_norm": 4.61058235168457, + "learning_rate": 3.0102347983142684e-07, + "loss": 0.5862, + "step": 1000 + }, + { + "epoch": 0.12161348585189645, + "grad_norm": 4.537725925445557, + "learning_rate": 3.0403371462974107e-07, + "loss": 0.6186, + "step": 1010 + }, + { + "epoch": 0.12281757977122215, + "grad_norm": 4.347688674926758, + "learning_rate": 3.070439494280554e-07, + "loss": 0.6127, + "step": 1020 + }, + { + "epoch": 0.12402167369054787, + "grad_norm": 4.965167045593262, + "learning_rate": 3.1005418422636965e-07, + "loss": 0.6026, + "step": 1030 + }, + { + "epoch": 0.12522576760987358, + "grad_norm": 4.610491752624512, + "learning_rate": 3.130644190246839e-07, + "loss": 0.6327, + "step": 1040 + }, + { + "epoch": 0.12642986152919927, + "grad_norm": 5.292304039001465, + "learning_rate": 3.160746538229982e-07, + "loss": 0.5972, + "step": 1050 + }, + { + "epoch": 0.127633955448525, + "grad_norm": 6.372762680053711, + "learning_rate": 3.1908488862131245e-07, + "loss": 0.6393, + "step": 1060 + }, + { + "epoch": 0.1288380493678507, + "grad_norm": 5.56066370010376, + "learning_rate": 3.220951234196267e-07, + "loss": 0.6196, + "step": 1070 + }, + { + "epoch": 0.1300421432871764, + "grad_norm": 4.777896881103516, + "learning_rate": 3.2510535821794097e-07, + "loss": 0.6217, + "step": 1080 + }, + { + "epoch": 0.1312462372065021, + "grad_norm": 4.9745683670043945, + "learning_rate": 3.2811559301625525e-07, + "loss": 0.6189, + "step": 1090 + }, + { + "epoch": 0.13245033112582782, + "grad_norm": 3.71576189994812, + "learning_rate": 3.3112582781456954e-07, + "loss": 0.5893, + "step": 1100 + }, + { + "epoch": 0.1336544250451535, + "grad_norm": 4.458312034606934, + "learning_rate": 3.3413606261288377e-07, + "loss": 0.5951, + "step": 1110 + }, + { + "epoch": 0.13485851896447923, + "grad_norm": 4.835500240325928, + "learning_rate": 3.3714629741119806e-07, + "loss": 0.5791, + "step": 1120 + }, + { + "epoch": 0.13606261288380495, + "grad_norm": 4.516515254974365, + "learning_rate": 3.4015653220951235e-07, + "loss": 0.5759, + "step": 1130 + }, + { + "epoch": 0.13726670680313063, + "grad_norm": 5.564052104949951, + "learning_rate": 3.431667670078266e-07, + "loss": 0.5791, + "step": 1140 + }, + { + "epoch": 0.13847080072245635, + "grad_norm": 5.586264610290527, + "learning_rate": 3.4617700180614086e-07, + "loss": 0.6142, + "step": 1150 + }, + { + "epoch": 0.13967489464178207, + "grad_norm": 4.408708572387695, + "learning_rate": 3.4918723660445515e-07, + "loss": 0.617, + "step": 
1160 + }, + { + "epoch": 0.14087898856110775, + "grad_norm": 4.4068403244018555, + "learning_rate": 3.521974714027694e-07, + "loss": 0.6099, + "step": 1170 + }, + { + "epoch": 0.14208308248043347, + "grad_norm": 3.947399854660034, + "learning_rate": 3.5520770620108367e-07, + "loss": 0.5555, + "step": 1180 + }, + { + "epoch": 0.1432871763997592, + "grad_norm": 5.264540195465088, + "learning_rate": 3.5821794099939795e-07, + "loss": 0.5556, + "step": 1190 + }, + { + "epoch": 0.1444912703190849, + "grad_norm": 4.486605644226074, + "learning_rate": 3.612281757977122e-07, + "loss": 0.5997, + "step": 1200 + }, + { + "epoch": 0.1456953642384106, + "grad_norm": 6.195891857147217, + "learning_rate": 3.642384105960264e-07, + "loss": 0.6104, + "step": 1210 + }, + { + "epoch": 0.1468994581577363, + "grad_norm": 4.5443572998046875, + "learning_rate": 3.6724864539434076e-07, + "loss": 0.5806, + "step": 1220 + }, + { + "epoch": 0.14810355207706202, + "grad_norm": 4.380715370178223, + "learning_rate": 3.70258880192655e-07, + "loss": 0.5759, + "step": 1230 + }, + { + "epoch": 0.1493076459963877, + "grad_norm": 5.033191680908203, + "learning_rate": 3.732691149909693e-07, + "loss": 0.5782, + "step": 1240 + }, + { + "epoch": 0.15051173991571343, + "grad_norm": 4.244385719299316, + "learning_rate": 3.7627934978928356e-07, + "loss": 0.5658, + "step": 1250 + }, + { + "epoch": 0.15171583383503914, + "grad_norm": 4.332985877990723, + "learning_rate": 3.792895845875978e-07, + "loss": 0.5702, + "step": 1260 + }, + { + "epoch": 0.15291992775436483, + "grad_norm": 4.5175628662109375, + "learning_rate": 3.822998193859121e-07, + "loss": 0.5588, + "step": 1270 + }, + { + "epoch": 0.15412402167369055, + "grad_norm": 4.519990921020508, + "learning_rate": 3.8531005418422637e-07, + "loss": 0.5871, + "step": 1280 + }, + { + "epoch": 0.15532811559301626, + "grad_norm": 4.500414848327637, + "learning_rate": 3.883202889825406e-07, + "loss": 0.5977, + "step": 1290 + }, + { + "epoch": 0.15653220951234195, + "grad_norm": 4.714526653289795, + "learning_rate": 3.913305237808549e-07, + "loss": 0.5647, + "step": 1300 + }, + { + "epoch": 0.15773630343166767, + "grad_norm": 4.869201183319092, + "learning_rate": 3.9434075857916917e-07, + "loss": 0.5816, + "step": 1310 + }, + { + "epoch": 0.15894039735099338, + "grad_norm": 5.167849540710449, + "learning_rate": 3.973509933774834e-07, + "loss": 0.5633, + "step": 1320 + }, + { + "epoch": 0.16014449127031907, + "grad_norm": 4.805886745452881, + "learning_rate": 4.003612281757977e-07, + "loss": 0.5858, + "step": 1330 + }, + { + "epoch": 0.1613485851896448, + "grad_norm": 4.569708824157715, + "learning_rate": 4.03371462974112e-07, + "loss": 0.5729, + "step": 1340 + }, + { + "epoch": 0.1625526791089705, + "grad_norm": 4.649074554443359, + "learning_rate": 4.0638169777242626e-07, + "loss": 0.5904, + "step": 1350 + }, + { + "epoch": 0.16375677302829622, + "grad_norm": 4.956695556640625, + "learning_rate": 4.093919325707405e-07, + "loss": 0.5743, + "step": 1360 + }, + { + "epoch": 0.1649608669476219, + "grad_norm": 5.056834697723389, + "learning_rate": 4.1240216736905473e-07, + "loss": 0.5903, + "step": 1370 + }, + { + "epoch": 0.16616496086694763, + "grad_norm": 4.751232624053955, + "learning_rate": 4.1541240216736907e-07, + "loss": 0.5697, + "step": 1380 + }, + { + "epoch": 0.16736905478627334, + "grad_norm": 4.0161027908325195, + "learning_rate": 4.184226369656833e-07, + "loss": 0.5588, + "step": 1390 + }, + { + "epoch": 0.16857314870559903, + "grad_norm": 4.591194152832031, + "learning_rate": 
4.2143287176399753e-07, + "loss": 0.5792, + "step": 1400 + }, + { + "epoch": 0.16977724262492475, + "grad_norm": 5.218972206115723, + "learning_rate": 4.2444310656231187e-07, + "loss": 0.5793, + "step": 1410 + }, + { + "epoch": 0.17098133654425046, + "grad_norm": 4.32102108001709, + "learning_rate": 4.274533413606261e-07, + "loss": 0.57, + "step": 1420 + }, + { + "epoch": 0.17218543046357615, + "grad_norm": 4.359175205230713, + "learning_rate": 4.3046357615894034e-07, + "loss": 0.5675, + "step": 1430 + }, + { + "epoch": 0.17338952438290187, + "grad_norm": 5.192026615142822, + "learning_rate": 4.334738109572547e-07, + "loss": 0.5668, + "step": 1440 + }, + { + "epoch": 0.17459361830222758, + "grad_norm": 4.002780914306641, + "learning_rate": 4.364840457555689e-07, + "loss": 0.5787, + "step": 1450 + }, + { + "epoch": 0.17579771222155327, + "grad_norm": 5.319111347198486, + "learning_rate": 4.3949428055388314e-07, + "loss": 0.5734, + "step": 1460 + }, + { + "epoch": 0.177001806140879, + "grad_norm": 4.700523376464844, + "learning_rate": 4.425045153521975e-07, + "loss": 0.5754, + "step": 1470 + }, + { + "epoch": 0.1782059000602047, + "grad_norm": 4.4386372566223145, + "learning_rate": 4.455147501505117e-07, + "loss": 0.5459, + "step": 1480 + }, + { + "epoch": 0.1794099939795304, + "grad_norm": 4.084826946258545, + "learning_rate": 4.48524984948826e-07, + "loss": 0.5399, + "step": 1490 + }, + { + "epoch": 0.1806140878988561, + "grad_norm": 4.401342391967773, + "learning_rate": 4.515352197471403e-07, + "loss": 0.573, + "step": 1500 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 4.5059685707092285, + "learning_rate": 4.545454545454545e-07, + "loss": 0.5724, + "step": 1510 + }, + { + "epoch": 0.18302227573750754, + "grad_norm": 5.070437431335449, + "learning_rate": 4.575556893437688e-07, + "loss": 0.5711, + "step": 1520 + }, + { + "epoch": 0.18422636965683323, + "grad_norm": 4.188956260681152, + "learning_rate": 4.6056592414208304e-07, + "loss": 0.5498, + "step": 1530 + }, + { + "epoch": 0.18543046357615894, + "grad_norm": 4.391158580780029, + "learning_rate": 4.635761589403973e-07, + "loss": 0.5602, + "step": 1540 + }, + { + "epoch": 0.18663455749548466, + "grad_norm": 5.272259712219238, + "learning_rate": 4.665863937387116e-07, + "loss": 0.5748, + "step": 1550 + }, + { + "epoch": 0.18783865141481035, + "grad_norm": 4.982473373413086, + "learning_rate": 4.6959662853702584e-07, + "loss": 0.5584, + "step": 1560 + }, + { + "epoch": 0.18904274533413606, + "grad_norm": 5.263506889343262, + "learning_rate": 4.7260686333534013e-07, + "loss": 0.5828, + "step": 1570 + }, + { + "epoch": 0.19024683925346178, + "grad_norm": 4.1373724937438965, + "learning_rate": 4.756170981336544e-07, + "loss": 0.5494, + "step": 1580 + }, + { + "epoch": 0.19145093317278747, + "grad_norm": 4.439697265625, + "learning_rate": 4.786273329319686e-07, + "loss": 0.5522, + "step": 1590 + }, + { + "epoch": 0.19265502709211318, + "grad_norm": 4.79713249206543, + "learning_rate": 4.81637567730283e-07, + "loss": 0.5058, + "step": 1600 + }, + { + "epoch": 0.1938591210114389, + "grad_norm": 3.973453998565674, + "learning_rate": 4.846478025285972e-07, + "loss": 0.5471, + "step": 1610 + }, + { + "epoch": 0.1950632149307646, + "grad_norm": 4.748741149902344, + "learning_rate": 4.876580373269115e-07, + "loss": 0.5768, + "step": 1620 + }, + { + "epoch": 0.1962673088500903, + "grad_norm": 5.98441743850708, + "learning_rate": 4.906682721252258e-07, + "loss": 0.5497, + "step": 1630 + }, + { + "epoch": 0.19747140276941602, + "grad_norm": 
5.55325174331665, + "learning_rate": 4.9367850692354e-07, + "loss": 0.5595, + "step": 1640 + }, + { + "epoch": 0.1986754966887417, + "grad_norm": 5.114386081695557, + "learning_rate": 4.966887417218543e-07, + "loss": 0.5635, + "step": 1650 + }, + { + "epoch": 0.19987959060806743, + "grad_norm": 4.869389533996582, + "learning_rate": 4.996989765201686e-07, + "loss": 0.5409, + "step": 1660 + }, + { + "epoch": 0.20108368452739314, + "grad_norm": 4.4507222175598145, + "learning_rate": 5.027092113184828e-07, + "loss": 0.5598, + "step": 1670 + }, + { + "epoch": 0.20228777844671886, + "grad_norm": 4.574100494384766, + "learning_rate": 5.057194461167971e-07, + "loss": 0.5432, + "step": 1680 + }, + { + "epoch": 0.20349187236604455, + "grad_norm": 4.581476211547852, + "learning_rate": 5.087296809151114e-07, + "loss": 0.5509, + "step": 1690 + }, + { + "epoch": 0.20469596628537026, + "grad_norm": 4.631548881530762, + "learning_rate": 5.117399157134256e-07, + "loss": 0.5712, + "step": 1700 + }, + { + "epoch": 0.20590006020469598, + "grad_norm": 5.006454944610596, + "learning_rate": 5.147501505117399e-07, + "loss": 0.5586, + "step": 1710 + }, + { + "epoch": 0.20710415412402167, + "grad_norm": 4.4788408279418945, + "learning_rate": 5.177603853100542e-07, + "loss": 0.5543, + "step": 1720 + }, + { + "epoch": 0.20830824804334738, + "grad_norm": 4.614450931549072, + "learning_rate": 5.207706201083684e-07, + "loss": 0.5677, + "step": 1730 + }, + { + "epoch": 0.2095123419626731, + "grad_norm": 4.377712249755859, + "learning_rate": 5.237808549066827e-07, + "loss": 0.5399, + "step": 1740 + }, + { + "epoch": 0.2107164358819988, + "grad_norm": 6.157577991485596, + "learning_rate": 5.26791089704997e-07, + "loss": 0.5288, + "step": 1750 + }, + { + "epoch": 0.2119205298013245, + "grad_norm": 4.206299781799316, + "learning_rate": 5.298013245033112e-07, + "loss": 0.5308, + "step": 1760 + }, + { + "epoch": 0.21312462372065022, + "grad_norm": 4.296496868133545, + "learning_rate": 5.328115593016255e-07, + "loss": 0.552, + "step": 1770 + }, + { + "epoch": 0.2143287176399759, + "grad_norm": 4.474640846252441, + "learning_rate": 5.358217940999398e-07, + "loss": 0.5505, + "step": 1780 + }, + { + "epoch": 0.21553281155930162, + "grad_norm": 4.762406349182129, + "learning_rate": 5.38832028898254e-07, + "loss": 0.5669, + "step": 1790 + }, + { + "epoch": 0.21673690547862734, + "grad_norm": 4.40052604675293, + "learning_rate": 5.418422636965683e-07, + "loss": 0.5386, + "step": 1800 + }, + { + "epoch": 0.21794099939795303, + "grad_norm": 4.364424228668213, + "learning_rate": 5.448524984948826e-07, + "loss": 0.5446, + "step": 1810 + }, + { + "epoch": 0.21914509331727874, + "grad_norm": 5.686670780181885, + "learning_rate": 5.478627332931969e-07, + "loss": 0.5708, + "step": 1820 + }, + { + "epoch": 0.22034918723660446, + "grad_norm": 6.244655132293701, + "learning_rate": 5.508729680915111e-07, + "loss": 0.5353, + "step": 1830 + }, + { + "epoch": 0.22155328115593018, + "grad_norm": 5.4936323165893555, + "learning_rate": 5.538832028898254e-07, + "loss": 0.5486, + "step": 1840 + }, + { + "epoch": 0.22275737507525586, + "grad_norm": 4.955344200134277, + "learning_rate": 5.568934376881397e-07, + "loss": 0.5142, + "step": 1850 + }, + { + "epoch": 0.22396146899458158, + "grad_norm": 4.333896636962891, + "learning_rate": 5.599036724864539e-07, + "loss": 0.5432, + "step": 1860 + }, + { + "epoch": 0.2251655629139073, + "grad_norm": 4.568367958068848, + "learning_rate": 5.629139072847681e-07, + "loss": 0.5351, + "step": 1870 + }, + { + "epoch": 
0.22636965683323299, + "grad_norm": 5.548391342163086, + "learning_rate": 5.659241420830825e-07, + "loss": 0.5053, + "step": 1880 + }, + { + "epoch": 0.2275737507525587, + "grad_norm": 4.526470184326172, + "learning_rate": 5.689343768813967e-07, + "loss": 0.5494, + "step": 1890 + }, + { + "epoch": 0.22877784467188442, + "grad_norm": 4.453249454498291, + "learning_rate": 5.719446116797109e-07, + "loss": 0.5397, + "step": 1900 + }, + { + "epoch": 0.2299819385912101, + "grad_norm": 7.503538131713867, + "learning_rate": 5.749548464780253e-07, + "loss": 0.5232, + "step": 1910 + }, + { + "epoch": 0.23118603251053582, + "grad_norm": 5.740428924560547, + "learning_rate": 5.779650812763396e-07, + "loss": 0.5426, + "step": 1920 + }, + { + "epoch": 0.23239012642986154, + "grad_norm": 5.185967445373535, + "learning_rate": 5.809753160746537e-07, + "loss": 0.5277, + "step": 1930 + }, + { + "epoch": 0.23359422034918723, + "grad_norm": 5.1867547035217285, + "learning_rate": 5.839855508729681e-07, + "loss": 0.5233, + "step": 1940 + }, + { + "epoch": 0.23479831426851294, + "grad_norm": 4.812213897705078, + "learning_rate": 5.869957856712824e-07, + "loss": 0.535, + "step": 1950 + }, + { + "epoch": 0.23600240818783866, + "grad_norm": 5.038625240325928, + "learning_rate": 5.900060204695965e-07, + "loss": 0.5365, + "step": 1960 + }, + { + "epoch": 0.23720650210716435, + "grad_norm": 4.050044536590576, + "learning_rate": 5.930162552679109e-07, + "loss": 0.5145, + "step": 1970 + }, + { + "epoch": 0.23841059602649006, + "grad_norm": 4.956125736236572, + "learning_rate": 5.960264900662252e-07, + "loss": 0.5141, + "step": 1980 + }, + { + "epoch": 0.23961468994581578, + "grad_norm": 4.40023136138916, + "learning_rate": 5.990367248645393e-07, + "loss": 0.544, + "step": 1990 + }, + { + "epoch": 0.2408187838651415, + "grad_norm": 5.268930912017822, + "learning_rate": 6.020469596628537e-07, + "loss": 0.5514, + "step": 2000 + }, + { + "epoch": 0.24202287778446718, + "grad_norm": 3.9441418647766113, + "learning_rate": 6.05057194461168e-07, + "loss": 0.5368, + "step": 2010 + }, + { + "epoch": 0.2432269717037929, + "grad_norm": 4.060418605804443, + "learning_rate": 6.080674292594821e-07, + "loss": 0.5228, + "step": 2020 + }, + { + "epoch": 0.24443106562311862, + "grad_norm": 4.1477861404418945, + "learning_rate": 6.110776640577965e-07, + "loss": 0.5221, + "step": 2030 + }, + { + "epoch": 0.2456351595424443, + "grad_norm": 5.319125175476074, + "learning_rate": 6.140878988561108e-07, + "loss": 0.5441, + "step": 2040 + }, + { + "epoch": 0.24683925346177002, + "grad_norm": 4.920033931732178, + "learning_rate": 6.17098133654425e-07, + "loss": 0.5307, + "step": 2050 + }, + { + "epoch": 0.24804334738109574, + "grad_norm": 5.167773246765137, + "learning_rate": 6.201083684527393e-07, + "loss": 0.5304, + "step": 2060 + }, + { + "epoch": 0.24924744130042142, + "grad_norm": 5.3018879890441895, + "learning_rate": 6.231186032510536e-07, + "loss": 0.5356, + "step": 2070 + }, + { + "epoch": 0.25045153521974717, + "grad_norm": 4.822166919708252, + "learning_rate": 6.261288380493678e-07, + "loss": 0.513, + "step": 2080 + }, + { + "epoch": 0.25165562913907286, + "grad_norm": 4.957582473754883, + "learning_rate": 6.291390728476821e-07, + "loss": 0.5069, + "step": 2090 + }, + { + "epoch": 0.25285972305839854, + "grad_norm": 6.180065155029297, + "learning_rate": 6.321493076459964e-07, + "loss": 0.5329, + "step": 2100 + }, + { + "epoch": 0.2540638169777243, + "grad_norm": 5.123517990112305, + "learning_rate": 6.351595424443106e-07, + "loss": 
0.5169, + "step": 2110 + }, + { + "epoch": 0.25526791089705, + "grad_norm": 5.372180938720703, + "learning_rate": 6.381697772426249e-07, + "loss": 0.508, + "step": 2120 + }, + { + "epoch": 0.25647200481637566, + "grad_norm": 3.907548189163208, + "learning_rate": 6.411800120409392e-07, + "loss": 0.5082, + "step": 2130 + }, + { + "epoch": 0.2576760987357014, + "grad_norm": 4.107047080993652, + "learning_rate": 6.441902468392534e-07, + "loss": 0.5257, + "step": 2140 + }, + { + "epoch": 0.2588801926550271, + "grad_norm": 5.055625915527344, + "learning_rate": 6.472004816375677e-07, + "loss": 0.5458, + "step": 2150 + }, + { + "epoch": 0.2600842865743528, + "grad_norm": 5.573007106781006, + "learning_rate": 6.502107164358819e-07, + "loss": 0.5178, + "step": 2160 + }, + { + "epoch": 0.26128838049367853, + "grad_norm": 4.955606460571289, + "learning_rate": 6.532209512341962e-07, + "loss": 0.5355, + "step": 2170 + }, + { + "epoch": 0.2624924744130042, + "grad_norm": 4.537413120269775, + "learning_rate": 6.562311860325105e-07, + "loss": 0.5393, + "step": 2180 + }, + { + "epoch": 0.2636965683323299, + "grad_norm": 5.761811256408691, + "learning_rate": 6.592414208308247e-07, + "loss": 0.5497, + "step": 2190 + }, + { + "epoch": 0.26490066225165565, + "grad_norm": 3.865335464477539, + "learning_rate": 6.622516556291391e-07, + "loss": 0.4914, + "step": 2200 + }, + { + "epoch": 0.26610475617098134, + "grad_norm": 4.600432872772217, + "learning_rate": 6.652618904274533e-07, + "loss": 0.5099, + "step": 2210 + }, + { + "epoch": 0.267308850090307, + "grad_norm": 4.737097263336182, + "learning_rate": 6.682721252257675e-07, + "loss": 0.5236, + "step": 2220 + }, + { + "epoch": 0.26851294400963277, + "grad_norm": 4.7886247634887695, + "learning_rate": 6.712823600240819e-07, + "loss": 0.5152, + "step": 2230 + }, + { + "epoch": 0.26971703792895846, + "grad_norm": 6.00905179977417, + "learning_rate": 6.742925948223961e-07, + "loss": 0.5369, + "step": 2240 + }, + { + "epoch": 0.27092113184828415, + "grad_norm": 5.080295085906982, + "learning_rate": 6.773028296207104e-07, + "loss": 0.5135, + "step": 2250 + }, + { + "epoch": 0.2721252257676099, + "grad_norm": 5.130943775177002, + "learning_rate": 6.803130644190247e-07, + "loss": 0.4921, + "step": 2260 + }, + { + "epoch": 0.2733293196869356, + "grad_norm": 4.8161187171936035, + "learning_rate": 6.833232992173389e-07, + "loss": 0.5243, + "step": 2270 + }, + { + "epoch": 0.27453341360626127, + "grad_norm": 5.960630416870117, + "learning_rate": 6.863335340156532e-07, + "loss": 0.525, + "step": 2280 + }, + { + "epoch": 0.275737507525587, + "grad_norm": 6.012716770172119, + "learning_rate": 6.893437688139675e-07, + "loss": 0.5126, + "step": 2290 + }, + { + "epoch": 0.2769416014449127, + "grad_norm": 4.913167476654053, + "learning_rate": 6.923540036122817e-07, + "loss": 0.531, + "step": 2300 + }, + { + "epoch": 0.2781456953642384, + "grad_norm": 5.190576076507568, + "learning_rate": 6.95364238410596e-07, + "loss": 0.5147, + "step": 2310 + }, + { + "epoch": 0.27934978928356413, + "grad_norm": 4.0760602951049805, + "learning_rate": 6.983744732089103e-07, + "loss": 0.5135, + "step": 2320 + }, + { + "epoch": 0.2805538832028898, + "grad_norm": 4.385684490203857, + "learning_rate": 7.013847080072245e-07, + "loss": 0.5196, + "step": 2330 + }, + { + "epoch": 0.2817579771222155, + "grad_norm": 4.470118045806885, + "learning_rate": 7.043949428055388e-07, + "loss": 0.502, + "step": 2340 + }, + { + "epoch": 0.28296207104154125, + "grad_norm": 4.798367023468018, + "learning_rate": 
7.074051776038531e-07, + "loss": 0.5078, + "step": 2350 + }, + { + "epoch": 0.28416616496086694, + "grad_norm": 4.64969539642334, + "learning_rate": 7.104154124021673e-07, + "loss": 0.5126, + "step": 2360 + }, + { + "epoch": 0.28537025888019263, + "grad_norm": 5.035313606262207, + "learning_rate": 7.134256472004816e-07, + "loss": 0.5068, + "step": 2370 + }, + { + "epoch": 0.2865743527995184, + "grad_norm": 3.7338409423828125, + "learning_rate": 7.164358819987959e-07, + "loss": 0.4956, + "step": 2380 + }, + { + "epoch": 0.28777844671884406, + "grad_norm": 5.102356910705566, + "learning_rate": 7.194461167971101e-07, + "loss": 0.5128, + "step": 2390 + }, + { + "epoch": 0.2889825406381698, + "grad_norm": 5.0710320472717285, + "learning_rate": 7.224563515954244e-07, + "loss": 0.5064, + "step": 2400 + }, + { + "epoch": 0.2901866345574955, + "grad_norm": 5.2054667472839355, + "learning_rate": 7.254665863937387e-07, + "loss": 0.5318, + "step": 2410 + }, + { + "epoch": 0.2913907284768212, + "grad_norm": 4.590500831604004, + "learning_rate": 7.284768211920528e-07, + "loss": 0.5352, + "step": 2420 + }, + { + "epoch": 0.2925948223961469, + "grad_norm": 5.737983226776123, + "learning_rate": 7.314870559903672e-07, + "loss": 0.5047, + "step": 2430 + }, + { + "epoch": 0.2937989163154726, + "grad_norm": 5.184499263763428, + "learning_rate": 7.344972907886815e-07, + "loss": 0.4998, + "step": 2440 + }, + { + "epoch": 0.2950030102347983, + "grad_norm": 5.553317070007324, + "learning_rate": 7.375075255869959e-07, + "loss": 0.5099, + "step": 2450 + }, + { + "epoch": 0.29620710415412405, + "grad_norm": 4.864592552185059, + "learning_rate": 7.4051776038531e-07, + "loss": 0.5071, + "step": 2460 + }, + { + "epoch": 0.29741119807344973, + "grad_norm": 4.1055803298950195, + "learning_rate": 7.435279951836243e-07, + "loss": 0.4985, + "step": 2470 + }, + { + "epoch": 0.2986152919927754, + "grad_norm": 5.875371932983398, + "learning_rate": 7.465382299819386e-07, + "loss": 0.4982, + "step": 2480 + }, + { + "epoch": 0.29981938591210117, + "grad_norm": 4.417768955230713, + "learning_rate": 7.495484647802528e-07, + "loss": 0.4999, + "step": 2490 + }, + { + "epoch": 0.30102347983142685, + "grad_norm": 4.034854888916016, + "learning_rate": 7.525586995785671e-07, + "loss": 0.5063, + "step": 2500 + }, + { + "epoch": 0.30222757375075254, + "grad_norm": 4.711478233337402, + "learning_rate": 7.555689343768814e-07, + "loss": 0.5194, + "step": 2510 + }, + { + "epoch": 0.3034316676700783, + "grad_norm": 4.778373718261719, + "learning_rate": 7.585791691751956e-07, + "loss": 0.5178, + "step": 2520 + }, + { + "epoch": 0.304635761589404, + "grad_norm": 3.896817922592163, + "learning_rate": 7.615894039735099e-07, + "loss": 0.5073, + "step": 2530 + }, + { + "epoch": 0.30583985550872966, + "grad_norm": 4.729064464569092, + "learning_rate": 7.645996387718242e-07, + "loss": 0.5083, + "step": 2540 + }, + { + "epoch": 0.3070439494280554, + "grad_norm": 4.760159015655518, + "learning_rate": 7.676098735701384e-07, + "loss": 0.5108, + "step": 2550 + }, + { + "epoch": 0.3082480433473811, + "grad_norm": 4.362825870513916, + "learning_rate": 7.706201083684527e-07, + "loss": 0.5027, + "step": 2560 + }, + { + "epoch": 0.3094521372667068, + "grad_norm": 4.749810695648193, + "learning_rate": 7.73630343166767e-07, + "loss": 0.5051, + "step": 2570 + }, + { + "epoch": 0.3106562311860325, + "grad_norm": 4.157332897186279, + "learning_rate": 7.766405779650812e-07, + "loss": 0.5, + "step": 2580 + }, + { + "epoch": 0.3118603251053582, + "grad_norm": 
4.272891044616699, + "learning_rate": 7.796508127633955e-07, + "loss": 0.4946, + "step": 2590 + }, + { + "epoch": 0.3130644190246839, + "grad_norm": 4.159026145935059, + "learning_rate": 7.826610475617098e-07, + "loss": 0.4992, + "step": 2600 + }, + { + "epoch": 0.31426851294400965, + "grad_norm": 5.095447063446045, + "learning_rate": 7.85671282360024e-07, + "loss": 0.4968, + "step": 2610 + }, + { + "epoch": 0.31547260686333534, + "grad_norm": 4.606817722320557, + "learning_rate": 7.886815171583383e-07, + "loss": 0.5018, + "step": 2620 + }, + { + "epoch": 0.316676700782661, + "grad_norm": 4.154166221618652, + "learning_rate": 7.916917519566526e-07, + "loss": 0.4848, + "step": 2630 + }, + { + "epoch": 0.31788079470198677, + "grad_norm": 4.749946117401123, + "learning_rate": 7.947019867549668e-07, + "loss": 0.4955, + "step": 2640 + }, + { + "epoch": 0.31908488862131246, + "grad_norm": 6.158957481384277, + "learning_rate": 7.977122215532812e-07, + "loss": 0.5088, + "step": 2650 + }, + { + "epoch": 0.32028898254063815, + "grad_norm": 4.356431484222412, + "learning_rate": 8.007224563515954e-07, + "loss": 0.5071, + "step": 2660 + }, + { + "epoch": 0.3214930764599639, + "grad_norm": 5.454282760620117, + "learning_rate": 8.037326911499096e-07, + "loss": 0.518, + "step": 2670 + }, + { + "epoch": 0.3226971703792896, + "grad_norm": 4.323178291320801, + "learning_rate": 8.06742925948224e-07, + "loss": 0.5077, + "step": 2680 + }, + { + "epoch": 0.32390126429861527, + "grad_norm": 5.352051258087158, + "learning_rate": 8.097531607465382e-07, + "loss": 0.5042, + "step": 2690 + }, + { + "epoch": 0.325105358217941, + "grad_norm": 4.680684566497803, + "learning_rate": 8.127633955448525e-07, + "loss": 0.5006, + "step": 2700 + }, + { + "epoch": 0.3263094521372667, + "grad_norm": 5.054072380065918, + "learning_rate": 8.157736303431668e-07, + "loss": 0.5005, + "step": 2710 + }, + { + "epoch": 0.32751354605659244, + "grad_norm": 4.090258598327637, + "learning_rate": 8.18783865141481e-07, + "loss": 0.4694, + "step": 2720 + }, + { + "epoch": 0.32871763997591813, + "grad_norm": 4.663838863372803, + "learning_rate": 8.217940999397953e-07, + "loss": 0.502, + "step": 2730 + }, + { + "epoch": 0.3299217338952438, + "grad_norm": 4.440493106842041, + "learning_rate": 8.248043347381095e-07, + "loss": 0.4933, + "step": 2740 + }, + { + "epoch": 0.33112582781456956, + "grad_norm": 5.184099197387695, + "learning_rate": 8.278145695364238e-07, + "loss": 0.5088, + "step": 2750 + }, + { + "epoch": 0.33232992173389525, + "grad_norm": 4.647283554077148, + "learning_rate": 8.308248043347381e-07, + "loss": 0.4909, + "step": 2760 + }, + { + "epoch": 0.33353401565322094, + "grad_norm": 4.6232500076293945, + "learning_rate": 8.338350391330523e-07, + "loss": 0.4929, + "step": 2770 + }, + { + "epoch": 0.3347381095725467, + "grad_norm": 5.234133720397949, + "learning_rate": 8.368452739313666e-07, + "loss": 0.5287, + "step": 2780 + }, + { + "epoch": 0.33594220349187237, + "grad_norm": 4.967161178588867, + "learning_rate": 8.398555087296809e-07, + "loss": 0.5041, + "step": 2790 + }, + { + "epoch": 0.33714629741119806, + "grad_norm": 4.8062591552734375, + "learning_rate": 8.428657435279951e-07, + "loss": 0.4878, + "step": 2800 + }, + { + "epoch": 0.3383503913305238, + "grad_norm": 5.188631534576416, + "learning_rate": 8.458759783263094e-07, + "loss": 0.4907, + "step": 2810 + }, + { + "epoch": 0.3395544852498495, + "grad_norm": 4.293895244598389, + "learning_rate": 8.488862131246237e-07, + "loss": 0.4952, + "step": 2820 + }, + { + "epoch": 
0.3407585791691752, + "grad_norm": 5.219202041625977, + "learning_rate": 8.518964479229379e-07, + "loss": 0.5046, + "step": 2830 + }, + { + "epoch": 0.3419626730885009, + "grad_norm": 4.529453754425049, + "learning_rate": 8.549066827212522e-07, + "loss": 0.4951, + "step": 2840 + }, + { + "epoch": 0.3431667670078266, + "grad_norm": 4.706615924835205, + "learning_rate": 8.579169175195666e-07, + "loss": 0.5083, + "step": 2850 + }, + { + "epoch": 0.3443708609271523, + "grad_norm": 5.135066986083984, + "learning_rate": 8.609271523178807e-07, + "loss": 0.4823, + "step": 2860 + }, + { + "epoch": 0.34557495484647804, + "grad_norm": 4.977953910827637, + "learning_rate": 8.63937387116195e-07, + "loss": 0.4845, + "step": 2870 + }, + { + "epoch": 0.34677904876580373, + "grad_norm": 4.964434623718262, + "learning_rate": 8.669476219145094e-07, + "loss": 0.5008, + "step": 2880 + }, + { + "epoch": 0.3479831426851294, + "grad_norm": 4.28712797164917, + "learning_rate": 8.699578567128235e-07, + "loss": 0.4819, + "step": 2890 + }, + { + "epoch": 0.34918723660445516, + "grad_norm": 4.125621318817139, + "learning_rate": 8.729680915111378e-07, + "loss": 0.505, + "step": 2900 + }, + { + "epoch": 0.35039133052378085, + "grad_norm": 4.779543399810791, + "learning_rate": 8.759783263094522e-07, + "loss": 0.5002, + "step": 2910 + }, + { + "epoch": 0.35159542444310654, + "grad_norm": 4.9358320236206055, + "learning_rate": 8.789885611077663e-07, + "loss": 0.4854, + "step": 2920 + }, + { + "epoch": 0.3527995183624323, + "grad_norm": 5.439524173736572, + "learning_rate": 8.819987959060806e-07, + "loss": 0.4893, + "step": 2930 + }, + { + "epoch": 0.354003612281758, + "grad_norm": 5.939353942871094, + "learning_rate": 8.85009030704395e-07, + "loss": 0.4876, + "step": 2940 + }, + { + "epoch": 0.35520770620108366, + "grad_norm": 5.600659370422363, + "learning_rate": 8.880192655027092e-07, + "loss": 0.4916, + "step": 2950 + }, + { + "epoch": 0.3564118001204094, + "grad_norm": 6.2792134284973145, + "learning_rate": 8.910295003010234e-07, + "loss": 0.5139, + "step": 2960 + }, + { + "epoch": 0.3576158940397351, + "grad_norm": 5.060665130615234, + "learning_rate": 8.940397350993378e-07, + "loss": 0.5138, + "step": 2970 + }, + { + "epoch": 0.3588199879590608, + "grad_norm": 5.271560192108154, + "learning_rate": 8.97049969897652e-07, + "loss": 0.4971, + "step": 2980 + }, + { + "epoch": 0.3600240818783865, + "grad_norm": 4.9547014236450195, + "learning_rate": 9.000602046959662e-07, + "loss": 0.4767, + "step": 2990 + }, + { + "epoch": 0.3612281757977122, + "grad_norm": 5.039198398590088, + "learning_rate": 9.030704394942806e-07, + "loss": 0.5038, + "step": 3000 + }, + { + "epoch": 0.3624322697170379, + "grad_norm": 3.5281832218170166, + "learning_rate": 9.060806742925948e-07, + "loss": 0.4837, + "step": 3010 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 4.734562873840332, + "learning_rate": 9.09090909090909e-07, + "loss": 0.4925, + "step": 3020 + }, + { + "epoch": 0.36484045755568933, + "grad_norm": 4.400488376617432, + "learning_rate": 9.121011438892233e-07, + "loss": 0.4819, + "step": 3030 + }, + { + "epoch": 0.3660445514750151, + "grad_norm": 4.797727584838867, + "learning_rate": 9.151113786875376e-07, + "loss": 0.4779, + "step": 3040 + }, + { + "epoch": 0.36724864539434077, + "grad_norm": 4.852715492248535, + "learning_rate": 9.181216134858518e-07, + "loss": 0.4581, + "step": 3050 + }, + { + "epoch": 0.36845273931366646, + "grad_norm": 4.8324971199035645, + "learning_rate": 9.211318482841661e-07, + "loss": 0.5075, + 
"step": 3060 + }, + { + "epoch": 0.3696568332329922, + "grad_norm": 4.099527835845947, + "learning_rate": 9.241420830824804e-07, + "loss": 0.4926, + "step": 3070 + }, + { + "epoch": 0.3708609271523179, + "grad_norm": 4.540558338165283, + "learning_rate": 9.271523178807946e-07, + "loss": 0.4901, + "step": 3080 + }, + { + "epoch": 0.3720650210716436, + "grad_norm": 4.567551612854004, + "learning_rate": 9.301625526791089e-07, + "loss": 0.4781, + "step": 3090 + }, + { + "epoch": 0.3732691149909693, + "grad_norm": 5.362119674682617, + "learning_rate": 9.331727874774232e-07, + "loss": 0.4784, + "step": 3100 + }, + { + "epoch": 0.374473208910295, + "grad_norm": 4.974254131317139, + "learning_rate": 9.361830222757375e-07, + "loss": 0.4985, + "step": 3110 + }, + { + "epoch": 0.3756773028296207, + "grad_norm": 4.490511417388916, + "learning_rate": 9.391932570740517e-07, + "loss": 0.4619, + "step": 3120 + }, + { + "epoch": 0.37688139674894644, + "grad_norm": 4.691735744476318, + "learning_rate": 9.42203491872366e-07, + "loss": 0.4892, + "step": 3130 + }, + { + "epoch": 0.37808549066827213, + "grad_norm": 5.031266689300537, + "learning_rate": 9.452137266706803e-07, + "loss": 0.4653, + "step": 3140 + }, + { + "epoch": 0.3792895845875978, + "grad_norm": 6.112424850463867, + "learning_rate": 9.482239614689945e-07, + "loss": 0.4887, + "step": 3150 + }, + { + "epoch": 0.38049367850692356, + "grad_norm": 4.281744480133057, + "learning_rate": 9.512341962673088e-07, + "loss": 0.4828, + "step": 3160 + }, + { + "epoch": 0.38169777242624925, + "grad_norm": 4.672320365905762, + "learning_rate": 9.54244431065623e-07, + "loss": 0.4807, + "step": 3170 + }, + { + "epoch": 0.38290186634557494, + "grad_norm": 4.8247528076171875, + "learning_rate": 9.572546658639373e-07, + "loss": 0.4652, + "step": 3180 + }, + { + "epoch": 0.3841059602649007, + "grad_norm": 4.806872844696045, + "learning_rate": 9.602649006622515e-07, + "loss": 0.4687, + "step": 3190 + }, + { + "epoch": 0.38531005418422637, + "grad_norm": 4.877020835876465, + "learning_rate": 9.63275135460566e-07, + "loss": 0.4954, + "step": 3200 + }, + { + "epoch": 0.38651414810355206, + "grad_norm": 5.005871295928955, + "learning_rate": 9.662853702588802e-07, + "loss": 0.5117, + "step": 3210 + }, + { + "epoch": 0.3877182420228778, + "grad_norm": 4.2746100425720215, + "learning_rate": 9.692956050571944e-07, + "loss": 0.472, + "step": 3220 + }, + { + "epoch": 0.3889223359422035, + "grad_norm": 4.155144691467285, + "learning_rate": 9.723058398555087e-07, + "loss": 0.4882, + "step": 3230 + }, + { + "epoch": 0.3901264298615292, + "grad_norm": 4.557404041290283, + "learning_rate": 9.75316074653823e-07, + "loss": 0.4845, + "step": 3240 + }, + { + "epoch": 0.3913305237808549, + "grad_norm": 4.442798614501953, + "learning_rate": 9.783263094521371e-07, + "loss": 0.4822, + "step": 3250 + }, + { + "epoch": 0.3925346177001806, + "grad_norm": 5.363224029541016, + "learning_rate": 9.813365442504516e-07, + "loss": 0.4808, + "step": 3260 + }, + { + "epoch": 0.3937387116195063, + "grad_norm": 4.809715747833252, + "learning_rate": 9.843467790487658e-07, + "loss": 0.4834, + "step": 3270 + }, + { + "epoch": 0.39494280553883204, + "grad_norm": 4.954145431518555, + "learning_rate": 9.8735701384708e-07, + "loss": 0.4796, + "step": 3280 + }, + { + "epoch": 0.39614689945815773, + "grad_norm": 4.381477355957031, + "learning_rate": 9.903672486453943e-07, + "loss": 0.465, + "step": 3290 + }, + { + "epoch": 0.3973509933774834, + "grad_norm": 5.086960315704346, + "learning_rate": 
9.933774834437085e-07, + "loss": 0.4996, + "step": 3300 + }, + { + "epoch": 0.39855508729680916, + "grad_norm": 5.4834303855896, + "learning_rate": 9.963877182420227e-07, + "loss": 0.4854, + "step": 3310 + }, + { + "epoch": 0.39975918121613485, + "grad_norm": 4.411494255065918, + "learning_rate": 9.993979530403372e-07, + "loss": 0.4882, + "step": 3320 + }, + { + "epoch": 0.40096327513546054, + "grad_norm": 3.9291751384735107, + "learning_rate": 9.999998233411383e-07, + "loss": 0.4975, + "step": 3330 + }, + { + "epoch": 0.4021673690547863, + "grad_norm": 4.288562774658203, + "learning_rate": 9.999991056647273e-07, + "loss": 0.4712, + "step": 3340 + }, + { + "epoch": 0.40337146297411197, + "grad_norm": 4.603250026702881, + "learning_rate": 9.999978359303795e-07, + "loss": 0.4933, + "step": 3350 + }, + { + "epoch": 0.4045755568934377, + "grad_norm": 4.753664970397949, + "learning_rate": 9.999960141394973e-07, + "loss": 0.4748, + "step": 3360 + }, + { + "epoch": 0.4057796508127634, + "grad_norm": 4.143571376800537, + "learning_rate": 9.99993640294092e-07, + "loss": 0.46, + "step": 3370 + }, + { + "epoch": 0.4069837447320891, + "grad_norm": 5.25679874420166, + "learning_rate": 9.99990714396784e-07, + "loss": 0.4859, + "step": 3380 + }, + { + "epoch": 0.40818783865141484, + "grad_norm": 5.903568744659424, + "learning_rate": 9.999872364508047e-07, + "loss": 0.4942, + "step": 3390 + }, + { + "epoch": 0.4093919325707405, + "grad_norm": 4.5355939865112305, + "learning_rate": 9.999832064599938e-07, + "loss": 0.4713, + "step": 3400 + }, + { + "epoch": 0.4105960264900662, + "grad_norm": 4.297218322753906, + "learning_rate": 9.999786244288008e-07, + "loss": 0.4701, + "step": 3410 + }, + { + "epoch": 0.41180012040939196, + "grad_norm": 4.364749908447266, + "learning_rate": 9.99973490362285e-07, + "loss": 0.4805, + "step": 3420 + }, + { + "epoch": 0.41300421432871764, + "grad_norm": 5.253974914550781, + "learning_rate": 9.999678042661147e-07, + "loss": 0.4728, + "step": 3430 + }, + { + "epoch": 0.41420830824804333, + "grad_norm": 3.7505037784576416, + "learning_rate": 9.999615661465685e-07, + "loss": 0.4666, + "step": 3440 + }, + { + "epoch": 0.4154124021673691, + "grad_norm": 4.56821346282959, + "learning_rate": 9.999547760105335e-07, + "loss": 0.4654, + "step": 3450 + }, + { + "epoch": 0.41661649608669477, + "grad_norm": 5.777834415435791, + "learning_rate": 9.999474338655073e-07, + "loss": 0.4708, + "step": 3460 + }, + { + "epoch": 0.41782059000602045, + "grad_norm": 4.463301181793213, + "learning_rate": 9.999395397195961e-07, + "loss": 0.4736, + "step": 3470 + }, + { + "epoch": 0.4190246839253462, + "grad_norm": 4.7559494972229, + "learning_rate": 9.999310935815165e-07, + "loss": 0.4858, + "step": 3480 + }, + { + "epoch": 0.4202287778446719, + "grad_norm": 5.451569557189941, + "learning_rate": 9.999220954605932e-07, + "loss": 0.4945, + "step": 3490 + }, + { + "epoch": 0.4214328717639976, + "grad_norm": 4.072139739990234, + "learning_rate": 9.99912545366762e-07, + "loss": 0.4685, + "step": 3500 + }, + { + "epoch": 0.4226369656833233, + "grad_norm": 5.299817085266113, + "learning_rate": 9.999024433105666e-07, + "loss": 0.4782, + "step": 3510 + }, + { + "epoch": 0.423841059602649, + "grad_norm": 4.960267543792725, + "learning_rate": 9.998917893031615e-07, + "loss": 0.4766, + "step": 3520 + }, + { + "epoch": 0.4250451535219747, + "grad_norm": 5.582713603973389, + "learning_rate": 9.998805833563096e-07, + "loss": 0.4737, + "step": 3530 + }, + { + "epoch": 0.42624924744130044, + "grad_norm": 
4.434458255767822, + "learning_rate": 9.998688254823836e-07, + "loss": 0.4679, + "step": 3540 + }, + { + "epoch": 0.4274533413606261, + "grad_norm": 4.943469524383545, + "learning_rate": 9.99856515694366e-07, + "loss": 0.4754, + "step": 3550 + }, + { + "epoch": 0.4286574352799518, + "grad_norm": 5.145878314971924, + "learning_rate": 9.998436540058476e-07, + "loss": 0.4855, + "step": 3560 + }, + { + "epoch": 0.42986152919927756, + "grad_norm": 4.884524822235107, + "learning_rate": 9.998302404310296e-07, + "loss": 0.4801, + "step": 3570 + }, + { + "epoch": 0.43106562311860325, + "grad_norm": 4.950911045074463, + "learning_rate": 9.998162749847223e-07, + "loss": 0.51, + "step": 3580 + }, + { + "epoch": 0.43226971703792894, + "grad_norm": 4.5520148277282715, + "learning_rate": 9.99801757682345e-07, + "loss": 0.4887, + "step": 3590 + }, + { + "epoch": 0.4334738109572547, + "grad_norm": 5.745821952819824, + "learning_rate": 9.997866885399265e-07, + "loss": 0.4934, + "step": 3600 + }, + { + "epoch": 0.43467790487658037, + "grad_norm": 4.750070095062256, + "learning_rate": 9.997710675741049e-07, + "loss": 0.4611, + "step": 3610 + }, + { + "epoch": 0.43588199879590606, + "grad_norm": 4.3570966720581055, + "learning_rate": 9.997548948021275e-07, + "loss": 0.4819, + "step": 3620 + }, + { + "epoch": 0.4370860927152318, + "grad_norm": 3.810598373413086, + "learning_rate": 9.997381702418513e-07, + "loss": 0.4514, + "step": 3630 + }, + { + "epoch": 0.4382901866345575, + "grad_norm": 4.763775825500488, + "learning_rate": 9.997208939117418e-07, + "loss": 0.4686, + "step": 3640 + }, + { + "epoch": 0.4394942805538832, + "grad_norm": 4.3974385261535645, + "learning_rate": 9.997030658308745e-07, + "loss": 0.4763, + "step": 3650 + }, + { + "epoch": 0.4406983744732089, + "grad_norm": 4.901960372924805, + "learning_rate": 9.996846860189332e-07, + "loss": 0.4649, + "step": 3660 + }, + { + "epoch": 0.4419024683925346, + "grad_norm": 3.764139175415039, + "learning_rate": 9.996657544962118e-07, + "loss": 0.4752, + "step": 3670 + }, + { + "epoch": 0.44310656231186035, + "grad_norm": 4.972975730895996, + "learning_rate": 9.996462712836126e-07, + "loss": 0.4736, + "step": 3680 + }, + { + "epoch": 0.44431065623118604, + "grad_norm": 3.928086757659912, + "learning_rate": 9.996262364026477e-07, + "loss": 0.4939, + "step": 3690 + }, + { + "epoch": 0.44551475015051173, + "grad_norm": 4.017699718475342, + "learning_rate": 9.99605649875438e-07, + "loss": 0.4693, + "step": 3700 + }, + { + "epoch": 0.4467188440698375, + "grad_norm": 6.103999137878418, + "learning_rate": 9.995845117247129e-07, + "loss": 0.4774, + "step": 3710 + }, + { + "epoch": 0.44792293798916316, + "grad_norm": 6.031617641448975, + "learning_rate": 9.99562821973812e-07, + "loss": 0.4528, + "step": 3720 + }, + { + "epoch": 0.44912703190848885, + "grad_norm": 4.691218852996826, + "learning_rate": 9.99540580646683e-07, + "loss": 0.4646, + "step": 3730 + }, + { + "epoch": 0.4503311258278146, + "grad_norm": 4.680331230163574, + "learning_rate": 9.995177877678832e-07, + "loss": 0.469, + "step": 3740 + }, + { + "epoch": 0.4515352197471403, + "grad_norm": 4.436509132385254, + "learning_rate": 9.994944433625784e-07, + "loss": 0.4619, + "step": 3750 + }, + { + "epoch": 0.45273931366646597, + "grad_norm": 4.72512149810791, + "learning_rate": 9.994705474565435e-07, + "loss": 0.4404, + "step": 3760 + }, + { + "epoch": 0.4539434075857917, + "grad_norm": 4.427882194519043, + "learning_rate": 9.994461000761627e-07, + "loss": 0.4826, + "step": 3770 + }, + { + "epoch": 
0.4551475015051174, + "grad_norm": 4.025267124176025, + "learning_rate": 9.994211012484285e-07, + "loss": 0.4671, + "step": 3780 + }, + { + "epoch": 0.4563515954244431, + "grad_norm": 5.315865516662598, + "learning_rate": 9.99395551000943e-07, + "loss": 0.4922, + "step": 3790 + }, + { + "epoch": 0.45755568934376883, + "grad_norm": 5.362889289855957, + "learning_rate": 9.993694493619162e-07, + "loss": 0.4554, + "step": 3800 + }, + { + "epoch": 0.4587597832630945, + "grad_norm": 3.8804094791412354, + "learning_rate": 9.993427963601674e-07, + "loss": 0.4558, + "step": 3810 + }, + { + "epoch": 0.4599638771824202, + "grad_norm": 3.8259241580963135, + "learning_rate": 9.99315592025125e-07, + "loss": 0.4756, + "step": 3820 + }, + { + "epoch": 0.46116797110174595, + "grad_norm": 3.806236505508423, + "learning_rate": 9.992878363868256e-07, + "loss": 0.4801, + "step": 3830 + }, + { + "epoch": 0.46237206502107164, + "grad_norm": 4.628232002258301, + "learning_rate": 9.992595294759147e-07, + "loss": 0.4953, + "step": 3840 + }, + { + "epoch": 0.46357615894039733, + "grad_norm": 4.719220161437988, + "learning_rate": 9.992306713236465e-07, + "loss": 0.4658, + "step": 3850 + }, + { + "epoch": 0.4647802528597231, + "grad_norm": 4.918371200561523, + "learning_rate": 9.992012619618838e-07, + "loss": 0.4691, + "step": 3860 + }, + { + "epoch": 0.46598434677904876, + "grad_norm": 4.425540447235107, + "learning_rate": 9.991713014230981e-07, + "loss": 0.4648, + "step": 3870 + }, + { + "epoch": 0.46718844069837445, + "grad_norm": 3.687819480895996, + "learning_rate": 9.99140789740369e-07, + "loss": 0.4714, + "step": 3880 + }, + { + "epoch": 0.4683925346177002, + "grad_norm": 4.835513591766357, + "learning_rate": 9.991097269473852e-07, + "loss": 0.4866, + "step": 3890 + }, + { + "epoch": 0.4695966285370259, + "grad_norm": 4.215537071228027, + "learning_rate": 9.990781130784437e-07, + "loss": 0.4697, + "step": 3900 + }, + { + "epoch": 0.4708007224563516, + "grad_norm": 4.371738433837891, + "learning_rate": 9.990459481684504e-07, + "loss": 0.4655, + "step": 3910 + }, + { + "epoch": 0.4720048163756773, + "grad_norm": 4.469852924346924, + "learning_rate": 9.990132322529181e-07, + "loss": 0.4416, + "step": 3920 + }, + { + "epoch": 0.473208910295003, + "grad_norm": 4.61678409576416, + "learning_rate": 9.989799653679701e-07, + "loss": 0.4625, + "step": 3930 + }, + { + "epoch": 0.4744130042143287, + "grad_norm": 5.12364387512207, + "learning_rate": 9.989461475503362e-07, + "loss": 0.4515, + "step": 3940 + }, + { + "epoch": 0.47561709813365444, + "grad_norm": 5.4315924644470215, + "learning_rate": 9.989117788373558e-07, + "loss": 0.4773, + "step": 3950 + }, + { + "epoch": 0.4768211920529801, + "grad_norm": 4.474724769592285, + "learning_rate": 9.988768592669756e-07, + "loss": 0.445, + "step": 3960 + }, + { + "epoch": 0.4780252859723058, + "grad_norm": 4.433851718902588, + "learning_rate": 9.98841388877751e-07, + "loss": 0.4667, + "step": 3970 + }, + { + "epoch": 0.47922937989163156, + "grad_norm": 4.388487815856934, + "learning_rate": 9.988053677088456e-07, + "loss": 0.443, + "step": 3980 + }, + { + "epoch": 0.48043347381095725, + "grad_norm": 5.400040149688721, + "learning_rate": 9.987687958000314e-07, + "loss": 0.4702, + "step": 3990 + }, + { + "epoch": 0.481637567730283, + "grad_norm": 4.436804294586182, + "learning_rate": 9.987316731916872e-07, + "loss": 0.4568, + "step": 4000 + }, + { + "epoch": 0.4828416616496087, + "grad_norm": 5.063580513000488, + "learning_rate": 9.986939999248014e-07, + "loss": 0.4877, + "step": 
4010 + }, + { + "epoch": 0.48404575556893437, + "grad_norm": 4.696618556976318, + "learning_rate": 9.986557760409694e-07, + "loss": 0.464, + "step": 4020 + }, + { + "epoch": 0.4852498494882601, + "grad_norm": 5.019808292388916, + "learning_rate": 9.98617001582395e-07, + "loss": 0.4533, + "step": 4030 + }, + { + "epoch": 0.4864539434075858, + "grad_norm": 4.419073104858398, + "learning_rate": 9.9857767659189e-07, + "loss": 0.4416, + "step": 4040 + }, + { + "epoch": 0.4876580373269115, + "grad_norm": 4.31454610824585, + "learning_rate": 9.985378011128736e-07, + "loss": 0.458, + "step": 4050 + }, + { + "epoch": 0.48886213124623723, + "grad_norm": 5.41327428817749, + "learning_rate": 9.98497375189373e-07, + "loss": 0.4669, + "step": 4060 + }, + { + "epoch": 0.4900662251655629, + "grad_norm": 4.439949035644531, + "learning_rate": 9.98456398866023e-07, + "loss": 0.4532, + "step": 4070 + }, + { + "epoch": 0.4912703190848886, + "grad_norm": 4.076527118682861, + "learning_rate": 9.98414872188067e-07, + "loss": 0.4565, + "step": 4080 + }, + { + "epoch": 0.49247441300421435, + "grad_norm": 4.239142894744873, + "learning_rate": 9.983727952013545e-07, + "loss": 0.4686, + "step": 4090 + }, + { + "epoch": 0.49367850692354004, + "grad_norm": 4.340599060058594, + "learning_rate": 9.98330167952344e-07, + "loss": 0.4654, + "step": 4100 + }, + { + "epoch": 0.4948826008428657, + "grad_norm": 4.37545108795166, + "learning_rate": 9.982869904881007e-07, + "loss": 0.4634, + "step": 4110 + }, + { + "epoch": 0.49608669476219147, + "grad_norm": 4.235968112945557, + "learning_rate": 9.982432628562976e-07, + "loss": 0.4537, + "step": 4120 + }, + { + "epoch": 0.49729078868151716, + "grad_norm": 5.080899715423584, + "learning_rate": 9.981989851052153e-07, + "loss": 0.4675, + "step": 4130 + }, + { + "epoch": 0.49849488260084285, + "grad_norm": 4.327193260192871, + "learning_rate": 9.98154157283742e-07, + "loss": 0.4336, + "step": 4140 + }, + { + "epoch": 0.4996989765201686, + "grad_norm": 4.647739887237549, + "learning_rate": 9.981087794413721e-07, + "loss": 0.4547, + "step": 4150 + }, + { + "epoch": 0.5009030704394943, + "grad_norm": 4.411125659942627, + "learning_rate": 9.980628516282088e-07, + "loss": 0.4453, + "step": 4160 + }, + { + "epoch": 0.50210716435882, + "grad_norm": 4.8657026290893555, + "learning_rate": 9.980163738949615e-07, + "loss": 0.4714, + "step": 4170 + }, + { + "epoch": 0.5033112582781457, + "grad_norm": 4.7668776512146, + "learning_rate": 9.97969346292947e-07, + "loss": 0.4472, + "step": 4180 + }, + { + "epoch": 0.5045153521974715, + "grad_norm": 5.490717887878418, + "learning_rate": 9.979217688740895e-07, + "loss": 0.4767, + "step": 4190 + }, + { + "epoch": 0.5057194461167971, + "grad_norm": 4.896997928619385, + "learning_rate": 9.978736416909201e-07, + "loss": 0.4714, + "step": 4200 + }, + { + "epoch": 0.5069235400361228, + "grad_norm": 4.777568340301514, + "learning_rate": 9.978249647965768e-07, + "loss": 0.4608, + "step": 4210 + }, + { + "epoch": 0.5081276339554486, + "grad_norm": 4.839885711669922, + "learning_rate": 9.977757382448047e-07, + "loss": 0.4798, + "step": 4220 + }, + { + "epoch": 0.5093317278747742, + "grad_norm": 4.311272144317627, + "learning_rate": 9.977259620899557e-07, + "loss": 0.4347, + "step": 4230 + }, + { + "epoch": 0.5105358217941, + "grad_norm": 4.5723772048950195, + "learning_rate": 9.976756363869883e-07, + "loss": 0.4485, + "step": 4240 + }, + { + "epoch": 0.5117399157134257, + "grad_norm": 4.344234943389893, + "learning_rate": 9.976247611914681e-07, + "loss": 0.4623, 
+ "step": 4250 + }, + { + "epoch": 0.5129440096327513, + "grad_norm": 4.216832160949707, + "learning_rate": 9.975733365595678e-07, + "loss": 0.4587, + "step": 4260 + }, + { + "epoch": 0.5141481035520771, + "grad_norm": 4.828461647033691, + "learning_rate": 9.975213625480656e-07, + "loss": 0.4616, + "step": 4270 + }, + { + "epoch": 0.5153521974714028, + "grad_norm": 4.608251571655273, + "learning_rate": 9.974688392143473e-07, + "loss": 0.4537, + "step": 4280 + }, + { + "epoch": 0.5165562913907285, + "grad_norm": 5.024391174316406, + "learning_rate": 9.974157666164047e-07, + "loss": 0.4596, + "step": 4290 + }, + { + "epoch": 0.5177603853100542, + "grad_norm": 4.869425296783447, + "learning_rate": 9.973621448128362e-07, + "loss": 0.468, + "step": 4300 + }, + { + "epoch": 0.5189644792293799, + "grad_norm": 4.599194526672363, + "learning_rate": 9.973079738628466e-07, + "loss": 0.4475, + "step": 4310 + }, + { + "epoch": 0.5201685731487056, + "grad_norm": 4.410305500030518, + "learning_rate": 9.972532538262473e-07, + "loss": 0.4684, + "step": 4320 + }, + { + "epoch": 0.5213726670680313, + "grad_norm": 3.9566409587860107, + "learning_rate": 9.971979847634552e-07, + "loss": 0.4472, + "step": 4330 + }, + { + "epoch": 0.5225767609873571, + "grad_norm": 4.608943462371826, + "learning_rate": 9.971421667354944e-07, + "loss": 0.4591, + "step": 4340 + }, + { + "epoch": 0.5237808549066827, + "grad_norm": 4.722293853759766, + "learning_rate": 9.97085799803994e-07, + "loss": 0.4529, + "step": 4350 + }, + { + "epoch": 0.5249849488260084, + "grad_norm": 4.868890762329102, + "learning_rate": 9.9702888403119e-07, + "loss": 0.4742, + "step": 4360 + }, + { + "epoch": 0.5261890427453342, + "grad_norm": 4.125800132751465, + "learning_rate": 9.969714194799243e-07, + "loss": 0.4501, + "step": 4370 + }, + { + "epoch": 0.5273931366646598, + "grad_norm": 4.570892810821533, + "learning_rate": 9.969134062136442e-07, + "loss": 0.4392, + "step": 4380 + }, + { + "epoch": 0.5285972305839856, + "grad_norm": 3.8944973945617676, + "learning_rate": 9.968548442964033e-07, + "loss": 0.4525, + "step": 4390 + }, + { + "epoch": 0.5298013245033113, + "grad_norm": 4.27981424331665, + "learning_rate": 9.96795733792861e-07, + "loss": 0.4607, + "step": 4400 + }, + { + "epoch": 0.5310054184226369, + "grad_norm": 4.3153300285339355, + "learning_rate": 9.96736074768282e-07, + "loss": 0.4709, + "step": 4410 + }, + { + "epoch": 0.5322095123419627, + "grad_norm": 5.543158531188965, + "learning_rate": 9.966758672885373e-07, + "loss": 0.4234, + "step": 4420 + }, + { + "epoch": 0.5334136062612884, + "grad_norm": 3.463160991668701, + "learning_rate": 9.966151114201027e-07, + "loss": 0.4684, + "step": 4430 + }, + { + "epoch": 0.534617700180614, + "grad_norm": 3.8580965995788574, + "learning_rate": 9.965538072300598e-07, + "loss": 0.4662, + "step": 4440 + }, + { + "epoch": 0.5358217940999398, + "grad_norm": 4.317717552185059, + "learning_rate": 9.96491954786096e-07, + "loss": 0.441, + "step": 4450 + }, + { + "epoch": 0.5370258880192655, + "grad_norm": 4.992043495178223, + "learning_rate": 9.964295541565035e-07, + "loss": 0.4575, + "step": 4460 + }, + { + "epoch": 0.5382299819385912, + "grad_norm": 4.042685031890869, + "learning_rate": 9.963666054101797e-07, + "loss": 0.421, + "step": 4470 + }, + { + "epoch": 0.5394340758579169, + "grad_norm": 4.4409260749816895, + "learning_rate": 9.96303108616628e-07, + "loss": 0.4684, + "step": 4480 + }, + { + "epoch": 0.5406381697772427, + "grad_norm": 4.652424335479736, + "learning_rate": 9.96239063845956e-07, + 
"loss": 0.4562, + "step": 4490 + }, + { + "epoch": 0.5418422636965683, + "grad_norm": 3.927960157394409, + "learning_rate": 9.961744711688765e-07, + "loss": 0.4636, + "step": 4500 + }, + { + "epoch": 0.543046357615894, + "grad_norm": 4.20367956161499, + "learning_rate": 9.961093306567074e-07, + "loss": 0.4629, + "step": 4510 + }, + { + "epoch": 0.5442504515352198, + "grad_norm": 5.0242791175842285, + "learning_rate": 9.960436423813721e-07, + "loss": 0.4699, + "step": 4520 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 4.339791297912598, + "learning_rate": 9.959774064153975e-07, + "loss": 0.4393, + "step": 4530 + }, + { + "epoch": 0.5466586393738712, + "grad_norm": 3.955888509750366, + "learning_rate": 9.959106228319164e-07, + "loss": 0.4419, + "step": 4540 + }, + { + "epoch": 0.5478627332931969, + "grad_norm": 4.508617401123047, + "learning_rate": 9.958432917046656e-07, + "loss": 0.4534, + "step": 4550 + }, + { + "epoch": 0.5490668272125225, + "grad_norm": 4.84667444229126, + "learning_rate": 9.957754131079865e-07, + "loss": 0.4621, + "step": 4560 + }, + { + "epoch": 0.5502709211318483, + "grad_norm": 4.65517520904541, + "learning_rate": 9.957069871168252e-07, + "loss": 0.4644, + "step": 4570 + }, + { + "epoch": 0.551475015051174, + "grad_norm": 4.428783416748047, + "learning_rate": 9.95638013806732e-07, + "loss": 0.4285, + "step": 4580 + }, + { + "epoch": 0.5526791089704997, + "grad_norm": 5.219538688659668, + "learning_rate": 9.955684932538615e-07, + "loss": 0.4342, + "step": 4590 + }, + { + "epoch": 0.5538832028898254, + "grad_norm": 4.356168270111084, + "learning_rate": 9.954984255349726e-07, + "loss": 0.4502, + "step": 4600 + }, + { + "epoch": 0.5550872968091511, + "grad_norm": 4.607705116271973, + "learning_rate": 9.954278107274286e-07, + "loss": 0.4397, + "step": 4610 + }, + { + "epoch": 0.5562913907284768, + "grad_norm": 4.667281150817871, + "learning_rate": 9.95356648909196e-07, + "loss": 0.4749, + "step": 4620 + }, + { + "epoch": 0.5574954846478025, + "grad_norm": 5.4144673347473145, + "learning_rate": 9.952849401588464e-07, + "loss": 0.4516, + "step": 4630 + }, + { + "epoch": 0.5586995785671283, + "grad_norm": 4.449268817901611, + "learning_rate": 9.952126845555544e-07, + "loss": 0.467, + "step": 4640 + }, + { + "epoch": 0.5599036724864539, + "grad_norm": 4.58141565322876, + "learning_rate": 9.951398821790988e-07, + "loss": 0.4674, + "step": 4650 + }, + { + "epoch": 0.5611077664057796, + "grad_norm": 4.779237747192383, + "learning_rate": 9.95066533109862e-07, + "loss": 0.4486, + "step": 4660 + }, + { + "epoch": 0.5623118603251054, + "grad_norm": 4.009070873260498, + "learning_rate": 9.949926374288298e-07, + "loss": 0.4466, + "step": 4670 + }, + { + "epoch": 0.563515954244431, + "grad_norm": 4.913680553436279, + "learning_rate": 9.949181952175922e-07, + "loss": 0.4574, + "step": 4680 + }, + { + "epoch": 0.5647200481637568, + "grad_norm": 4.114124774932861, + "learning_rate": 9.94843206558342e-07, + "loss": 0.4556, + "step": 4690 + }, + { + "epoch": 0.5659241420830825, + "grad_norm": 4.208637237548828, + "learning_rate": 9.94767671533875e-07, + "loss": 0.4446, + "step": 4700 + }, + { + "epoch": 0.5671282360024081, + "grad_norm": 4.362401962280273, + "learning_rate": 9.946915902275914e-07, + "loss": 0.4591, + "step": 4710 + }, + { + "epoch": 0.5683323299217339, + "grad_norm": 4.419969081878662, + "learning_rate": 9.946149627234939e-07, + "loss": 0.4352, + "step": 4720 + }, + { + "epoch": 0.5695364238410596, + "grad_norm": 5.162231922149658, + "learning_rate": 
9.94537789106188e-07, + "loss": 0.4613, + "step": 4730 + }, + { + "epoch": 0.5707405177603853, + "grad_norm": 4.270598411560059, + "learning_rate": 9.944600694608825e-07, + "loss": 0.4628, + "step": 4740 + }, + { + "epoch": 0.571944611679711, + "grad_norm": 4.181495666503906, + "learning_rate": 9.943818038733891e-07, + "loss": 0.4391, + "step": 4750 + }, + { + "epoch": 0.5731487055990367, + "grad_norm": 4.3339033126831055, + "learning_rate": 9.943029924301225e-07, + "loss": 0.4406, + "step": 4760 + }, + { + "epoch": 0.5743527995183624, + "grad_norm": 4.909811496734619, + "learning_rate": 9.942236352180996e-07, + "loss": 0.4575, + "step": 4770 + }, + { + "epoch": 0.5755568934376881, + "grad_norm": 4.58059549331665, + "learning_rate": 9.941437323249398e-07, + "loss": 0.4613, + "step": 4780 + }, + { + "epoch": 0.5767609873570139, + "grad_norm": 3.9194531440734863, + "learning_rate": 9.94063283838866e-07, + "loss": 0.4449, + "step": 4790 + }, + { + "epoch": 0.5779650812763396, + "grad_norm": 4.602609634399414, + "learning_rate": 9.93982289848702e-07, + "loss": 0.4622, + "step": 4800 + }, + { + "epoch": 0.5791691751956652, + "grad_norm": 4.630181789398193, + "learning_rate": 9.939007504438754e-07, + "loss": 0.442, + "step": 4810 + }, + { + "epoch": 0.580373269114991, + "grad_norm": 3.903799057006836, + "learning_rate": 9.938186657144149e-07, + "loss": 0.4624, + "step": 4820 + }, + { + "epoch": 0.5815773630343167, + "grad_norm": 5.423624515533447, + "learning_rate": 9.937360357509522e-07, + "loss": 0.4372, + "step": 4830 + }, + { + "epoch": 0.5827814569536424, + "grad_norm": 4.571367263793945, + "learning_rate": 9.936528606447198e-07, + "loss": 0.4521, + "step": 4840 + }, + { + "epoch": 0.5839855508729681, + "grad_norm": 3.8848462104797363, + "learning_rate": 9.935691404875534e-07, + "loss": 0.4399, + "step": 4850 + }, + { + "epoch": 0.5851896447922939, + "grad_norm": 4.659217357635498, + "learning_rate": 9.934848753718896e-07, + "loss": 0.4345, + "step": 4860 + }, + { + "epoch": 0.5863937387116195, + "grad_norm": 5.5009026527404785, + "learning_rate": 9.934000653907672e-07, + "loss": 0.4173, + "step": 4870 + }, + { + "epoch": 0.5875978326309452, + "grad_norm": 3.984834671020508, + "learning_rate": 9.933147106378263e-07, + "loss": 0.4354, + "step": 4880 + }, + { + "epoch": 0.588801926550271, + "grad_norm": 4.0750346183776855, + "learning_rate": 9.932288112073086e-07, + "loss": 0.4447, + "step": 4890 + }, + { + "epoch": 0.5900060204695966, + "grad_norm": 4.871407985687256, + "learning_rate": 9.931423671940575e-07, + "loss": 0.4501, + "step": 4900 + }, + { + "epoch": 0.5912101143889223, + "grad_norm": 4.388524055480957, + "learning_rate": 9.93055378693517e-07, + "loss": 0.4421, + "step": 4910 + }, + { + "epoch": 0.5924142083082481, + "grad_norm": 4.511969566345215, + "learning_rate": 9.929678458017328e-07, + "loss": 0.4431, + "step": 4920 + }, + { + "epoch": 0.5936183022275737, + "grad_norm": 4.788571834564209, + "learning_rate": 9.928797686153514e-07, + "loss": 0.4621, + "step": 4930 + }, + { + "epoch": 0.5948223961468995, + "grad_norm": 5.144417762756348, + "learning_rate": 9.927911472316205e-07, + "loss": 0.4418, + "step": 4940 + }, + { + "epoch": 0.5960264900662252, + "grad_norm": 4.649743556976318, + "learning_rate": 9.927019817483887e-07, + "loss": 0.4639, + "step": 4950 + }, + { + "epoch": 0.5972305839855508, + "grad_norm": 4.76192045211792, + "learning_rate": 9.92612272264105e-07, + "loss": 0.4646, + "step": 4960 + }, + { + "epoch": 0.5984346779048766, + "grad_norm": 4.137574195861816, + 
"learning_rate": 9.925220188778193e-07, + "loss": 0.4537, + "step": 4970 + }, + { + "epoch": 0.5996387718242023, + "grad_norm": 4.616219997406006, + "learning_rate": 9.924312216891819e-07, + "loss": 0.4451, + "step": 4980 + }, + { + "epoch": 0.600842865743528, + "grad_norm": 4.623941421508789, + "learning_rate": 9.923398807984438e-07, + "loss": 0.4441, + "step": 4990 + }, + { + "epoch": 0.6020469596628537, + "grad_norm": 4.540246486663818, + "learning_rate": 9.92247996306456e-07, + "loss": 0.4477, + "step": 5000 + }, + { + "epoch": 0.6032510535821795, + "grad_norm": 4.742766380310059, + "learning_rate": 9.921555683146695e-07, + "loss": 0.4672, + "step": 5010 + }, + { + "epoch": 0.6044551475015051, + "grad_norm": 5.316002368927002, + "learning_rate": 9.920625969251364e-07, + "loss": 0.4593, + "step": 5020 + }, + { + "epoch": 0.6056592414208308, + "grad_norm": 4.386168003082275, + "learning_rate": 9.919690822405074e-07, + "loss": 0.4438, + "step": 5030 + }, + { + "epoch": 0.6068633353401566, + "grad_norm": 3.9734067916870117, + "learning_rate": 9.91875024364034e-07, + "loss": 0.4428, + "step": 5040 + }, + { + "epoch": 0.6080674292594822, + "grad_norm": 4.917031764984131, + "learning_rate": 9.917804233995673e-07, + "loss": 0.4622, + "step": 5050 + }, + { + "epoch": 0.609271523178808, + "grad_norm": 4.690892696380615, + "learning_rate": 9.916852794515575e-07, + "loss": 0.4513, + "step": 5060 + }, + { + "epoch": 0.6104756170981337, + "grad_norm": 4.1330952644348145, + "learning_rate": 9.915895926250552e-07, + "loss": 0.4523, + "step": 5070 + }, + { + "epoch": 0.6116797110174593, + "grad_norm": 4.932434558868408, + "learning_rate": 9.9149336302571e-07, + "loss": 0.4407, + "step": 5080 + }, + { + "epoch": 0.6128838049367851, + "grad_norm": 4.421885967254639, + "learning_rate": 9.913965907597702e-07, + "loss": 0.4332, + "step": 5090 + }, + { + "epoch": 0.6140878988561108, + "grad_norm": 5.199044704437256, + "learning_rate": 9.91299275934084e-07, + "loss": 0.426, + "step": 5100 + }, + { + "epoch": 0.6152919927754364, + "grad_norm": 4.189499855041504, + "learning_rate": 9.912014186560984e-07, + "loss": 0.4326, + "step": 5110 + }, + { + "epoch": 0.6164960866947622, + "grad_norm": 4.297112464904785, + "learning_rate": 9.911030190338597e-07, + "loss": 0.4622, + "step": 5120 + }, + { + "epoch": 0.6177001806140879, + "grad_norm": 3.9968087673187256, + "learning_rate": 9.910040771760122e-07, + "loss": 0.447, + "step": 5130 + }, + { + "epoch": 0.6189042745334136, + "grad_norm": 4.857995510101318, + "learning_rate": 9.909045931917998e-07, + "loss": 0.4343, + "step": 5140 + }, + { + "epoch": 0.6201083684527393, + "grad_norm": 3.741711378097534, + "learning_rate": 9.908045671910642e-07, + "loss": 0.4366, + "step": 5150 + }, + { + "epoch": 0.621312462372065, + "grad_norm": 4.424086093902588, + "learning_rate": 9.907039992842461e-07, + "loss": 0.448, + "step": 5160 + }, + { + "epoch": 0.6225165562913907, + "grad_norm": 5.499582767486572, + "learning_rate": 9.906028895823842e-07, + "loss": 0.4546, + "step": 5170 + }, + { + "epoch": 0.6237206502107164, + "grad_norm": 4.836984634399414, + "learning_rate": 9.905012381971157e-07, + "loss": 0.4605, + "step": 5180 + }, + { + "epoch": 0.6249247441300422, + "grad_norm": 4.31553316116333, + "learning_rate": 9.903990452406756e-07, + "loss": 0.4302, + "step": 5190 + }, + { + "epoch": 0.6261288380493678, + "grad_norm": 4.909146785736084, + "learning_rate": 9.902963108258968e-07, + "loss": 0.4445, + "step": 5200 + }, + { + "epoch": 0.6273329319686936, + "grad_norm": 
4.295082092285156, + "learning_rate": 9.901930350662103e-07, + "loss": 0.4364, + "step": 5210 + }, + { + "epoch": 0.6285370258880193, + "grad_norm": 4.154002666473389, + "learning_rate": 9.90089218075645e-07, + "loss": 0.4526, + "step": 5220 + }, + { + "epoch": 0.6297411198073449, + "grad_norm": 4.30592679977417, + "learning_rate": 9.89984859968827e-07, + "loss": 0.4442, + "step": 5230 + }, + { + "epoch": 0.6309452137266707, + "grad_norm": 5.334674835205078, + "learning_rate": 9.898799608609795e-07, + "loss": 0.4415, + "step": 5240 + }, + { + "epoch": 0.6321493076459964, + "grad_norm": 4.136261940002441, + "learning_rate": 9.897745208679239e-07, + "loss": 0.4442, + "step": 5250 + }, + { + "epoch": 0.633353401565322, + "grad_norm": 4.585081577301025, + "learning_rate": 9.896685401060782e-07, + "loss": 0.4565, + "step": 5260 + }, + { + "epoch": 0.6345574954846478, + "grad_norm": 4.742111682891846, + "learning_rate": 9.895620186924578e-07, + "loss": 0.4393, + "step": 5270 + }, + { + "epoch": 0.6357615894039735, + "grad_norm": 3.9798941612243652, + "learning_rate": 9.894549567446748e-07, + "loss": 0.4255, + "step": 5280 + }, + { + "epoch": 0.6369656833232992, + "grad_norm": 4.722369194030762, + "learning_rate": 9.893473543809383e-07, + "loss": 0.4377, + "step": 5290 + }, + { + "epoch": 0.6381697772426249, + "grad_norm": 4.399467945098877, + "learning_rate": 9.892392117200536e-07, + "loss": 0.4215, + "step": 5300 + }, + { + "epoch": 0.6393738711619507, + "grad_norm": 4.718751430511475, + "learning_rate": 9.891305288814235e-07, + "loss": 0.4372, + "step": 5310 + }, + { + "epoch": 0.6405779650812763, + "grad_norm": 4.376132488250732, + "learning_rate": 9.890213059850465e-07, + "loss": 0.4567, + "step": 5320 + }, + { + "epoch": 0.641782059000602, + "grad_norm": 5.186975955963135, + "learning_rate": 9.889115431515173e-07, + "loss": 0.4414, + "step": 5330 + }, + { + "epoch": 0.6429861529199278, + "grad_norm": 4.560245037078857, + "learning_rate": 9.888012405020271e-07, + "loss": 0.4329, + "step": 5340 + }, + { + "epoch": 0.6441902468392534, + "grad_norm": 5.553184986114502, + "learning_rate": 9.886903981583632e-07, + "loss": 0.4472, + "step": 5350 + }, + { + "epoch": 0.6453943407585792, + "grad_norm": 5.126540660858154, + "learning_rate": 9.885790162429086e-07, + "loss": 0.4577, + "step": 5360 + }, + { + "epoch": 0.6465984346779049, + "grad_norm": 5.031693935394287, + "learning_rate": 9.884670948786417e-07, + "loss": 0.4608, + "step": 5370 + }, + { + "epoch": 0.6478025285972305, + "grad_norm": 4.265883445739746, + "learning_rate": 9.883546341891373e-07, + "loss": 0.4335, + "step": 5380 + }, + { + "epoch": 0.6490066225165563, + "grad_norm": 3.7793495655059814, + "learning_rate": 9.88241634298565e-07, + "loss": 0.4481, + "step": 5390 + }, + { + "epoch": 0.650210716435882, + "grad_norm": 4.184829235076904, + "learning_rate": 9.881280953316903e-07, + "loss": 0.4351, + "step": 5400 + }, + { + "epoch": 0.6514148103552077, + "grad_norm": 5.431835174560547, + "learning_rate": 9.880140174138735e-07, + "loss": 0.4739, + "step": 5410 + }, + { + "epoch": 0.6526189042745334, + "grad_norm": 5.218166828155518, + "learning_rate": 9.878994006710695e-07, + "loss": 0.4547, + "step": 5420 + }, + { + "epoch": 0.6538229981938591, + "grad_norm": 5.319456100463867, + "learning_rate": 9.877842452298293e-07, + "loss": 0.453, + "step": 5430 + }, + { + "epoch": 0.6550270921131849, + "grad_norm": 4.373801231384277, + "learning_rate": 9.876685512172979e-07, + "loss": 0.4245, + "step": 5440 + }, + { + "epoch": 0.6562311860325105, 
+ "grad_norm": 4.274784088134766, + "learning_rate": 9.875523187612153e-07, + "loss": 0.4327, + "step": 5450 + }, + { + "epoch": 0.6574352799518363, + "grad_norm": 5.235876560211182, + "learning_rate": 9.874355479899157e-07, + "loss": 0.4365, + "step": 5460 + }, + { + "epoch": 0.658639373871162, + "grad_norm": 4.505414962768555, + "learning_rate": 9.873182390323275e-07, + "loss": 0.4236, + "step": 5470 + }, + { + "epoch": 0.6598434677904876, + "grad_norm": 5.843977451324463, + "learning_rate": 9.87200392017974e-07, + "loss": 0.4482, + "step": 5480 + }, + { + "epoch": 0.6610475617098134, + "grad_norm": 4.754218578338623, + "learning_rate": 9.870820070769723e-07, + "loss": 0.4526, + "step": 5490 + }, + { + "epoch": 0.6622516556291391, + "grad_norm": 4.734755992889404, + "learning_rate": 9.869630843400329e-07, + "loss": 0.4286, + "step": 5500 + }, + { + "epoch": 0.6634557495484648, + "grad_norm": 4.781942367553711, + "learning_rate": 9.868436239384608e-07, + "loss": 0.4395, + "step": 5510 + }, + { + "epoch": 0.6646598434677905, + "grad_norm": 4.710615634918213, + "learning_rate": 9.86723626004154e-07, + "loss": 0.4437, + "step": 5520 + }, + { + "epoch": 0.6658639373871162, + "grad_norm": 3.9797275066375732, + "learning_rate": 9.86603090669605e-07, + "loss": 0.4285, + "step": 5530 + }, + { + "epoch": 0.6670680313064419, + "grad_norm": 5.289978981018066, + "learning_rate": 9.864820180678984e-07, + "loss": 0.4482, + "step": 5540 + }, + { + "epoch": 0.6682721252257676, + "grad_norm": 3.6335768699645996, + "learning_rate": 9.86360408332713e-07, + "loss": 0.4578, + "step": 5550 + }, + { + "epoch": 0.6694762191450934, + "grad_norm": 3.998011589050293, + "learning_rate": 9.862382615983201e-07, + "loss": 0.439, + "step": 5560 + }, + { + "epoch": 0.670680313064419, + "grad_norm": 4.6308369636535645, + "learning_rate": 9.861155779995843e-07, + "loss": 0.4416, + "step": 5570 + }, + { + "epoch": 0.6718844069837447, + "grad_norm": 4.869227409362793, + "learning_rate": 9.859923576719623e-07, + "loss": 0.4271, + "step": 5580 + }, + { + "epoch": 0.6730885009030705, + "grad_norm": 4.426019668579102, + "learning_rate": 9.858686007515043e-07, + "loss": 0.424, + "step": 5590 + }, + { + "epoch": 0.6742925948223961, + "grad_norm": 4.659002304077148, + "learning_rate": 9.857443073748526e-07, + "loss": 0.4419, + "step": 5600 + }, + { + "epoch": 0.6754966887417219, + "grad_norm": 3.8600122928619385, + "learning_rate": 9.856194776792412e-07, + "loss": 0.4397, + "step": 5610 + }, + { + "epoch": 0.6767007826610476, + "grad_norm": 4.6182756423950195, + "learning_rate": 9.854941118024973e-07, + "loss": 0.454, + "step": 5620 + }, + { + "epoch": 0.6779048765803732, + "grad_norm": 4.149092674255371, + "learning_rate": 9.853682098830392e-07, + "loss": 0.426, + "step": 5630 + }, + { + "epoch": 0.679108970499699, + "grad_norm": 4.583498954772949, + "learning_rate": 9.852417720598778e-07, + "loss": 0.4226, + "step": 5640 + }, + { + "epoch": 0.6803130644190247, + "grad_norm": 4.789090633392334, + "learning_rate": 9.851147984726152e-07, + "loss": 0.4506, + "step": 5650 + }, + { + "epoch": 0.6815171583383504, + "grad_norm": 3.850926160812378, + "learning_rate": 9.849872892614452e-07, + "loss": 0.4149, + "step": 5660 + }, + { + "epoch": 0.6827212522576761, + "grad_norm": 4.576216697692871, + "learning_rate": 9.848592445671532e-07, + "loss": 0.4364, + "step": 5670 + }, + { + "epoch": 0.6839253461770018, + "grad_norm": 5.302231311798096, + "learning_rate": 9.847306645311152e-07, + "loss": 0.4529, + "step": 5680 + }, + { + "epoch": 
0.6851294400963275, + "grad_norm": 4.6318864822387695, + "learning_rate": 9.846015492952993e-07, + "loss": 0.4299, + "step": 5690 + }, + { + "epoch": 0.6863335340156532, + "grad_norm": 4.18743896484375, + "learning_rate": 9.844718990022634e-07, + "loss": 0.4567, + "step": 5700 + }, + { + "epoch": 0.687537627934979, + "grad_norm": 4.45042610168457, + "learning_rate": 9.84341713795157e-07, + "loss": 0.4461, + "step": 5710 + }, + { + "epoch": 0.6887417218543046, + "grad_norm": 4.0155415534973145, + "learning_rate": 9.842109938177197e-07, + "loss": 0.4422, + "step": 5720 + }, + { + "epoch": 0.6899458157736303, + "grad_norm": 4.72194242477417, + "learning_rate": 9.840797392142819e-07, + "loss": 0.4499, + "step": 5730 + }, + { + "epoch": 0.6911499096929561, + "grad_norm": 4.1018595695495605, + "learning_rate": 9.83947950129764e-07, + "loss": 0.4305, + "step": 5740 + }, + { + "epoch": 0.6923540036122817, + "grad_norm": 4.466518402099609, + "learning_rate": 9.838156267096772e-07, + "loss": 0.437, + "step": 5750 + }, + { + "epoch": 0.6935580975316075, + "grad_norm": 4.084195137023926, + "learning_rate": 9.836827691001215e-07, + "loss": 0.4571, + "step": 5760 + }, + { + "epoch": 0.6947621914509332, + "grad_norm": 4.3810319900512695, + "learning_rate": 9.835493774477876e-07, + "loss": 0.4358, + "step": 5770 + }, + { + "epoch": 0.6959662853702588, + "grad_norm": 4.7473464012146, + "learning_rate": 9.834154518999558e-07, + "loss": 0.4307, + "step": 5780 + }, + { + "epoch": 0.6971703792895846, + "grad_norm": 4.240455627441406, + "learning_rate": 9.832809926044953e-07, + "loss": 0.4456, + "step": 5790 + }, + { + "epoch": 0.6983744732089103, + "grad_norm": 4.3158087730407715, + "learning_rate": 9.831459997098653e-07, + "loss": 0.4268, + "step": 5800 + }, + { + "epoch": 0.699578567128236, + "grad_norm": 4.3610005378723145, + "learning_rate": 9.83010473365114e-07, + "loss": 0.4334, + "step": 5810 + }, + { + "epoch": 0.7007826610475617, + "grad_norm": 4.417696952819824, + "learning_rate": 9.828744137198778e-07, + "loss": 0.4451, + "step": 5820 + }, + { + "epoch": 0.7019867549668874, + "grad_norm": 4.091536998748779, + "learning_rate": 9.827378209243833e-07, + "loss": 0.4277, + "step": 5830 + }, + { + "epoch": 0.7031908488862131, + "grad_norm": 5.2131028175354, + "learning_rate": 9.826006951294448e-07, + "loss": 0.4353, + "step": 5840 + }, + { + "epoch": 0.7043949428055388, + "grad_norm": 4.724157810211182, + "learning_rate": 9.824630364864653e-07, + "loss": 0.4379, + "step": 5850 + }, + { + "epoch": 0.7055990367248646, + "grad_norm": 3.924499034881592, + "learning_rate": 9.82324845147436e-07, + "loss": 0.4341, + "step": 5860 + }, + { + "epoch": 0.7068031306441902, + "grad_norm": 3.9886951446533203, + "learning_rate": 9.821861212649367e-07, + "loss": 0.4458, + "step": 5870 + }, + { + "epoch": 0.708007224563516, + "grad_norm": 5.176059246063232, + "learning_rate": 9.820468649921348e-07, + "loss": 0.4277, + "step": 5880 + }, + { + "epoch": 0.7092113184828417, + "grad_norm": 5.795221328735352, + "learning_rate": 9.819070764827856e-07, + "loss": 0.4608, + "step": 5890 + }, + { + "epoch": 0.7104154124021673, + "grad_norm": 4.0651702880859375, + "learning_rate": 9.81766755891232e-07, + "loss": 0.4349, + "step": 5900 + }, + { + "epoch": 0.7116195063214931, + "grad_norm": 4.822697162628174, + "learning_rate": 9.816259033724051e-07, + "loss": 0.4368, + "step": 5910 + }, + { + "epoch": 0.7128236002408188, + "grad_norm": 3.429680585861206, + "learning_rate": 9.814845190818218e-07, + "loss": 0.4119, + "step": 5920 + }, + 
{ + "epoch": 0.7140276941601444, + "grad_norm": 4.649044513702393, + "learning_rate": 9.813426031755873e-07, + "loss": 0.431, + "step": 5930 + }, + { + "epoch": 0.7152317880794702, + "grad_norm": 4.576180458068848, + "learning_rate": 9.812001558103937e-07, + "loss": 0.4478, + "step": 5940 + }, + { + "epoch": 0.7164358819987959, + "grad_norm": 4.996614933013916, + "learning_rate": 9.810571771435196e-07, + "loss": 0.4013, + "step": 5950 + }, + { + "epoch": 0.7176399759181216, + "grad_norm": 5.006197929382324, + "learning_rate": 9.809136673328305e-07, + "loss": 0.4275, + "step": 5960 + }, + { + "epoch": 0.7188440698374473, + "grad_norm": 3.766942024230957, + "learning_rate": 9.807696265367776e-07, + "loss": 0.4377, + "step": 5970 + }, + { + "epoch": 0.720048163756773, + "grad_norm": 4.086816787719727, + "learning_rate": 9.806250549143992e-07, + "loss": 0.4384, + "step": 5980 + }, + { + "epoch": 0.7212522576760987, + "grad_norm": 5.5871734619140625, + "learning_rate": 9.804799526253196e-07, + "loss": 0.4511, + "step": 5990 + }, + { + "epoch": 0.7224563515954244, + "grad_norm": 4.023412704467773, + "learning_rate": 9.803343198297484e-07, + "loss": 0.4446, + "step": 6000 + }, + { + "epoch": 0.7236604455147502, + "grad_norm": 4.708857536315918, + "learning_rate": 9.80188156688482e-07, + "loss": 0.4395, + "step": 6010 + }, + { + "epoch": 0.7248645394340758, + "grad_norm": 3.879977226257324, + "learning_rate": 9.80041463362901e-07, + "loss": 0.4434, + "step": 6020 + }, + { + "epoch": 0.7260686333534015, + "grad_norm": 4.743607997894287, + "learning_rate": 9.798942400149726e-07, + "loss": 0.4365, + "step": 6030 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 3.6438701152801514, + "learning_rate": 9.797464868072486e-07, + "loss": 0.447, + "step": 6040 + }, + { + "epoch": 0.7284768211920529, + "grad_norm": 4.472813129425049, + "learning_rate": 9.79598203902866e-07, + "loss": 0.443, + "step": 6050 + }, + { + "epoch": 0.7296809151113787, + "grad_norm": 5.6175312995910645, + "learning_rate": 9.794493914655467e-07, + "loss": 0.4207, + "step": 6060 + }, + { + "epoch": 0.7308850090307044, + "grad_norm": 4.9606404304504395, + "learning_rate": 9.793000496595966e-07, + "loss": 0.4279, + "step": 6070 + }, + { + "epoch": 0.7320891029500302, + "grad_norm": 4.130514144897461, + "learning_rate": 9.791501786499074e-07, + "loss": 0.4183, + "step": 6080 + }, + { + "epoch": 0.7332931968693558, + "grad_norm": 2.9547371864318848, + "learning_rate": 9.78999778601954e-07, + "loss": 0.4038, + "step": 6090 + }, + { + "epoch": 0.7344972907886815, + "grad_norm": 4.06984281539917, + "learning_rate": 9.788488496817958e-07, + "loss": 0.4333, + "step": 6100 + }, + { + "epoch": 0.7357013847080073, + "grad_norm": 3.900606870651245, + "learning_rate": 9.78697392056076e-07, + "loss": 0.418, + "step": 6110 + }, + { + "epoch": 0.7369054786273329, + "grad_norm": 4.396324157714844, + "learning_rate": 9.78545405892022e-07, + "loss": 0.435, + "step": 6120 + }, + { + "epoch": 0.7381095725466587, + "grad_norm": 4.068949222564697, + "learning_rate": 9.78392891357444e-07, + "loss": 0.4138, + "step": 6130 + }, + { + "epoch": 0.7393136664659844, + "grad_norm": 4.090792655944824, + "learning_rate": 9.782398486207364e-07, + "loss": 0.4106, + "step": 6140 + }, + { + "epoch": 0.74051776038531, + "grad_norm": 5.222830295562744, + "learning_rate": 9.780862778508762e-07, + "loss": 0.4534, + "step": 6150 + }, + { + "epoch": 0.7417218543046358, + "grad_norm": 3.9300661087036133, + "learning_rate": 9.779321792174238e-07, + "loss": 0.4436, + "step": 
6160 + }, + { + "epoch": 0.7429259482239615, + "grad_norm": 4.139192581176758, + "learning_rate": 9.77777552890522e-07, + "loss": 0.4384, + "step": 6170 + }, + { + "epoch": 0.7441300421432872, + "grad_norm": 4.677849292755127, + "learning_rate": 9.776223990408969e-07, + "loss": 0.4338, + "step": 6180 + }, + { + "epoch": 0.7453341360626129, + "grad_norm": 4.7174391746521, + "learning_rate": 9.77466717839856e-07, + "loss": 0.4265, + "step": 6190 + }, + { + "epoch": 0.7465382299819386, + "grad_norm": 4.314562797546387, + "learning_rate": 9.773105094592903e-07, + "loss": 0.4389, + "step": 6200 + }, + { + "epoch": 0.7477423239012643, + "grad_norm": 4.679368495941162, + "learning_rate": 9.77153774071672e-07, + "loss": 0.4177, + "step": 6210 + }, + { + "epoch": 0.74894641782059, + "grad_norm": 4.037609577178955, + "learning_rate": 9.769965118500554e-07, + "loss": 0.4376, + "step": 6220 + }, + { + "epoch": 0.7501505117399158, + "grad_norm": 4.8901448249816895, + "learning_rate": 9.768387229680765e-07, + "loss": 0.4597, + "step": 6230 + }, + { + "epoch": 0.7513546056592414, + "grad_norm": 4.4093122482299805, + "learning_rate": 9.76680407599953e-07, + "loss": 0.4332, + "step": 6240 + }, + { + "epoch": 0.7525586995785671, + "grad_norm": 4.720508575439453, + "learning_rate": 9.765215659204837e-07, + "loss": 0.4579, + "step": 6250 + }, + { + "epoch": 0.7537627934978929, + "grad_norm": 4.316104412078857, + "learning_rate": 9.763621981050486e-07, + "loss": 0.4499, + "step": 6260 + }, + { + "epoch": 0.7549668874172185, + "grad_norm": 4.805814743041992, + "learning_rate": 9.762023043296082e-07, + "loss": 0.4229, + "step": 6270 + }, + { + "epoch": 0.7561709813365443, + "grad_norm": 4.259012699127197, + "learning_rate": 9.760418847707042e-07, + "loss": 0.4307, + "step": 6280 + }, + { + "epoch": 0.75737507525587, + "grad_norm": 4.74151086807251, + "learning_rate": 9.75880939605459e-07, + "loss": 0.4039, + "step": 6290 + }, + { + "epoch": 0.7585791691751956, + "grad_norm": 4.7510294914245605, + "learning_rate": 9.757194690115747e-07, + "loss": 0.4302, + "step": 6300 + }, + { + "epoch": 0.7597832630945214, + "grad_norm": 5.057920455932617, + "learning_rate": 9.75557473167334e-07, + "loss": 0.4196, + "step": 6310 + }, + { + "epoch": 0.7609873570138471, + "grad_norm": 4.428061485290527, + "learning_rate": 9.753949522515992e-07, + "loss": 0.4271, + "step": 6320 + }, + { + "epoch": 0.7621914509331728, + "grad_norm": 4.023929595947266, + "learning_rate": 9.75231906443813e-07, + "loss": 0.4125, + "step": 6330 + }, + { + "epoch": 0.7633955448524985, + "grad_norm": 4.456701755523682, + "learning_rate": 9.75068335923997e-07, + "loss": 0.4177, + "step": 6340 + }, + { + "epoch": 0.7645996387718242, + "grad_norm": 4.046926975250244, + "learning_rate": 9.749042408727517e-07, + "loss": 0.4172, + "step": 6350 + }, + { + "epoch": 0.7658037326911499, + "grad_norm": 4.5811944007873535, + "learning_rate": 9.747396214712584e-07, + "loss": 0.4165, + "step": 6360 + }, + { + "epoch": 0.7670078266104756, + "grad_norm": 3.6832375526428223, + "learning_rate": 9.745744779012757e-07, + "loss": 0.4183, + "step": 6370 + }, + { + "epoch": 0.7682119205298014, + "grad_norm": 4.535373210906982, + "learning_rate": 9.744088103451417e-07, + "loss": 0.4205, + "step": 6380 + }, + { + "epoch": 0.769416014449127, + "grad_norm": 4.3140363693237305, + "learning_rate": 9.742426189857729e-07, + "loss": 0.4414, + "step": 6390 + }, + { + "epoch": 0.7706201083684527, + "grad_norm": 4.968809604644775, + "learning_rate": 9.74075904006664e-07, + "loss": 0.4421, 
+ "step": 6400 + }, + { + "epoch": 0.7718242022877785, + "grad_norm": 4.488393783569336, + "learning_rate": 9.739086655918883e-07, + "loss": 0.441, + "step": 6410 + }, + { + "epoch": 0.7730282962071041, + "grad_norm": 4.255595684051514, + "learning_rate": 9.737409039260966e-07, + "loss": 0.4211, + "step": 6420 + }, + { + "epoch": 0.7742323901264299, + "grad_norm": 4.285024642944336, + "learning_rate": 9.735726191945175e-07, + "loss": 0.42, + "step": 6430 + }, + { + "epoch": 0.7754364840457556, + "grad_norm": 4.8813347816467285, + "learning_rate": 9.734038115829571e-07, + "loss": 0.433, + "step": 6440 + }, + { + "epoch": 0.7766405779650812, + "grad_norm": 3.9893128871917725, + "learning_rate": 9.732344812777987e-07, + "loss": 0.3902, + "step": 6450 + }, + { + "epoch": 0.777844671884407, + "grad_norm": 4.2948784828186035, + "learning_rate": 9.730646284660035e-07, + "loss": 0.4094, + "step": 6460 + }, + { + "epoch": 0.7790487658037327, + "grad_norm": 4.328617572784424, + "learning_rate": 9.728942533351087e-07, + "loss": 0.4412, + "step": 6470 + }, + { + "epoch": 0.7802528597230584, + "grad_norm": 4.67041015625, + "learning_rate": 9.727233560732286e-07, + "loss": 0.4157, + "step": 6480 + }, + { + "epoch": 0.7814569536423841, + "grad_norm": 4.249061584472656, + "learning_rate": 9.725519368690538e-07, + "loss": 0.4398, + "step": 6490 + }, + { + "epoch": 0.7826610475617098, + "grad_norm": 5.444673538208008, + "learning_rate": 9.723799959118513e-07, + "loss": 0.4299, + "step": 6500 + }, + { + "epoch": 0.7838651414810355, + "grad_norm": 4.813880920410156, + "learning_rate": 9.722075333914642e-07, + "loss": 0.4483, + "step": 6510 + }, + { + "epoch": 0.7850692354003612, + "grad_norm": 3.9406328201293945, + "learning_rate": 9.720345494983116e-07, + "loss": 0.4101, + "step": 6520 + }, + { + "epoch": 0.786273329319687, + "grad_norm": 5.169934272766113, + "learning_rate": 9.718610444233878e-07, + "loss": 0.4284, + "step": 6530 + }, + { + "epoch": 0.7874774232390126, + "grad_norm": 4.304941177368164, + "learning_rate": 9.71687018358263e-07, + "loss": 0.4232, + "step": 6540 + }, + { + "epoch": 0.7886815171583383, + "grad_norm": 4.452000141143799, + "learning_rate": 9.715124714950827e-07, + "loss": 0.4506, + "step": 6550 + }, + { + "epoch": 0.7898856110776641, + "grad_norm": 3.7503676414489746, + "learning_rate": 9.713374040265668e-07, + "loss": 0.4246, + "step": 6560 + }, + { + "epoch": 0.7910897049969897, + "grad_norm": 4.534003257751465, + "learning_rate": 9.71161816146011e-07, + "loss": 0.4247, + "step": 6570 + }, + { + "epoch": 0.7922937989163155, + "grad_norm": 5.637129306793213, + "learning_rate": 9.709857080472845e-07, + "loss": 0.4419, + "step": 6580 + }, + { + "epoch": 0.7934978928356412, + "grad_norm": 3.844273805618286, + "learning_rate": 9.708090799248313e-07, + "loss": 0.4042, + "step": 6590 + }, + { + "epoch": 0.7947019867549668, + "grad_norm": 4.556625843048096, + "learning_rate": 9.706319319736703e-07, + "loss": 0.4384, + "step": 6600 + }, + { + "epoch": 0.7959060806742926, + "grad_norm": 4.6486053466796875, + "learning_rate": 9.70454264389393e-07, + "loss": 0.4091, + "step": 6610 + }, + { + "epoch": 0.7971101745936183, + "grad_norm": 4.751596927642822, + "learning_rate": 9.702760773681658e-07, + "loss": 0.428, + "step": 6620 + }, + { + "epoch": 0.798314268512944, + "grad_norm": 4.64603328704834, + "learning_rate": 9.700973711067282e-07, + "loss": 0.4376, + "step": 6630 + }, + { + "epoch": 0.7995183624322697, + "grad_norm": 4.823798656463623, + "learning_rate": 9.699181458023927e-07, + 
"loss": 0.4057, + "step": 6640 + }, + { + "epoch": 0.8007224563515954, + "grad_norm": 5.07472562789917, + "learning_rate": 9.697384016530451e-07, + "loss": 0.4103, + "step": 6650 + }, + { + "epoch": 0.8019265502709211, + "grad_norm": 5.586597442626953, + "learning_rate": 9.695581388571444e-07, + "loss": 0.4401, + "step": 6660 + }, + { + "epoch": 0.8031306441902468, + "grad_norm": 5.10539436340332, + "learning_rate": 9.693773576137219e-07, + "loss": 0.4298, + "step": 6670 + }, + { + "epoch": 0.8043347381095726, + "grad_norm": 5.036708354949951, + "learning_rate": 9.691960581223815e-07, + "loss": 0.4299, + "step": 6680 + }, + { + "epoch": 0.8055388320288982, + "grad_norm": 4.794188499450684, + "learning_rate": 9.690142405832988e-07, + "loss": 0.4296, + "step": 6690 + }, + { + "epoch": 0.8067429259482239, + "grad_norm": 4.483447074890137, + "learning_rate": 9.688319051972223e-07, + "loss": 0.4063, + "step": 6700 + }, + { + "epoch": 0.8079470198675497, + "grad_norm": 4.88456916809082, + "learning_rate": 9.686490521654713e-07, + "loss": 0.4548, + "step": 6710 + }, + { + "epoch": 0.8091511137868754, + "grad_norm": 4.166242599487305, + "learning_rate": 9.684656816899374e-07, + "loss": 0.4344, + "step": 6720 + }, + { + "epoch": 0.8103552077062011, + "grad_norm": 4.282528877258301, + "learning_rate": 9.682817939730831e-07, + "loss": 0.4143, + "step": 6730 + }, + { + "epoch": 0.8115593016255268, + "grad_norm": 4.342618942260742, + "learning_rate": 9.680973892179423e-07, + "loss": 0.4224, + "step": 6740 + }, + { + "epoch": 0.8127633955448526, + "grad_norm": 4.768647193908691, + "learning_rate": 9.679124676281195e-07, + "loss": 0.4251, + "step": 6750 + }, + { + "epoch": 0.8139674894641782, + "grad_norm": 4.024239540100098, + "learning_rate": 9.677270294077896e-07, + "loss": 0.4415, + "step": 6760 + }, + { + "epoch": 0.8151715833835039, + "grad_norm": 3.9242262840270996, + "learning_rate": 9.675410747616984e-07, + "loss": 0.4475, + "step": 6770 + }, + { + "epoch": 0.8163756773028297, + "grad_norm": 4.580953121185303, + "learning_rate": 9.67354603895162e-07, + "loss": 0.4067, + "step": 6780 + }, + { + "epoch": 0.8175797712221553, + "grad_norm": 4.859120845794678, + "learning_rate": 9.67167617014066e-07, + "loss": 0.4311, + "step": 6790 + }, + { + "epoch": 0.818783865141481, + "grad_norm": 4.1437835693359375, + "learning_rate": 9.66980114324866e-07, + "loss": 0.4135, + "step": 6800 + }, + { + "epoch": 0.8199879590608068, + "grad_norm": 4.027251243591309, + "learning_rate": 9.667920960345872e-07, + "loss": 0.4021, + "step": 6810 + }, + { + "epoch": 0.8211920529801324, + "grad_norm": 4.283502101898193, + "learning_rate": 9.666035623508237e-07, + "loss": 0.4207, + "step": 6820 + }, + { + "epoch": 0.8223961468994582, + "grad_norm": 4.910589694976807, + "learning_rate": 9.66414513481739e-07, + "loss": 0.4474, + "step": 6830 + }, + { + "epoch": 0.8236002408187839, + "grad_norm": 5.238614559173584, + "learning_rate": 9.662249496360653e-07, + "loss": 0.4294, + "step": 6840 + }, + { + "epoch": 0.8248043347381095, + "grad_norm": 4.113722801208496, + "learning_rate": 9.660348710231036e-07, + "loss": 0.4145, + "step": 6850 + }, + { + "epoch": 0.8260084286574353, + "grad_norm": 4.979987144470215, + "learning_rate": 9.65844277852723e-07, + "loss": 0.421, + "step": 6860 + }, + { + "epoch": 0.827212522576761, + "grad_norm": 5.396749973297119, + "learning_rate": 9.656531703353608e-07, + "loss": 0.4444, + "step": 6870 + }, + { + "epoch": 0.8284166164960867, + "grad_norm": 4.567556858062744, + "learning_rate": 
9.654615486820222e-07, + "loss": 0.4198, + "step": 6880 + }, + { + "epoch": 0.8296207104154124, + "grad_norm": 5.2882304191589355, + "learning_rate": 9.6526941310428e-07, + "loss": 0.4274, + "step": 6890 + }, + { + "epoch": 0.8308248043347382, + "grad_norm": 4.51816987991333, + "learning_rate": 9.650767638142746e-07, + "loss": 0.4465, + "step": 6900 + }, + { + "epoch": 0.8320288982540638, + "grad_norm": 3.9410834312438965, + "learning_rate": 9.648836010247137e-07, + "loss": 0.4182, + "step": 6910 + }, + { + "epoch": 0.8332329921733895, + "grad_norm": 4.620553493499756, + "learning_rate": 9.646899249488714e-07, + "loss": 0.4206, + "step": 6920 + }, + { + "epoch": 0.8344370860927153, + "grad_norm": 4.430214881896973, + "learning_rate": 9.644957358005892e-07, + "loss": 0.4313, + "step": 6930 + }, + { + "epoch": 0.8356411800120409, + "grad_norm": 4.277939796447754, + "learning_rate": 9.643010337942747e-07, + "loss": 0.4313, + "step": 6940 + }, + { + "epoch": 0.8368452739313667, + "grad_norm": 5.185015678405762, + "learning_rate": 9.64105819144902e-07, + "loss": 0.4225, + "step": 6950 + }, + { + "epoch": 0.8380493678506924, + "grad_norm": 4.402646541595459, + "learning_rate": 9.63910092068011e-07, + "loss": 0.417, + "step": 6960 + }, + { + "epoch": 0.839253461770018, + "grad_norm": 3.664020538330078, + "learning_rate": 9.637138527797074e-07, + "loss": 0.4337, + "step": 6970 + }, + { + "epoch": 0.8404575556893438, + "grad_norm": 4.9388041496276855, + "learning_rate": 9.635171014966625e-07, + "loss": 0.412, + "step": 6980 + }, + { + "epoch": 0.8416616496086695, + "grad_norm": 4.200076103210449, + "learning_rate": 9.63319838436113e-07, + "loss": 0.4212, + "step": 6990 + }, + { + "epoch": 0.8428657435279951, + "grad_norm": 4.56259822845459, + "learning_rate": 9.631220638158605e-07, + "loss": 0.4316, + "step": 7000 + }, + { + "epoch": 0.8440698374473209, + "grad_norm": 3.910545587539673, + "learning_rate": 9.629237778542714e-07, + "loss": 0.4, + "step": 7010 + }, + { + "epoch": 0.8452739313666466, + "grad_norm": 4.639405250549316, + "learning_rate": 9.62724980770277e-07, + "loss": 0.4084, + "step": 7020 + }, + { + "epoch": 0.8464780252859723, + "grad_norm": 4.84975528717041, + "learning_rate": 9.625256727833725e-07, + "loss": 0.4331, + "step": 7030 + }, + { + "epoch": 0.847682119205298, + "grad_norm": 3.9190306663513184, + "learning_rate": 9.623258541136175e-07, + "loss": 0.4171, + "step": 7040 + }, + { + "epoch": 0.8488862131246238, + "grad_norm": 4.248600482940674, + "learning_rate": 9.621255249816353e-07, + "loss": 0.4255, + "step": 7050 + }, + { + "epoch": 0.8500903070439494, + "grad_norm": 4.055094242095947, + "learning_rate": 9.61924685608613e-07, + "loss": 0.4257, + "step": 7060 + }, + { + "epoch": 0.8512944009632751, + "grad_norm": 4.14054536819458, + "learning_rate": 9.617233362163007e-07, + "loss": 0.4046, + "step": 7070 + }, + { + "epoch": 0.8524984948826009, + "grad_norm": 5.480048179626465, + "learning_rate": 9.61521477027012e-07, + "loss": 0.4007, + "step": 7080 + }, + { + "epoch": 0.8537025888019265, + "grad_norm": 4.100722312927246, + "learning_rate": 9.613191082636232e-07, + "loss": 0.4148, + "step": 7090 + }, + { + "epoch": 0.8549066827212523, + "grad_norm": 3.739861011505127, + "learning_rate": 9.611162301495735e-07, + "loss": 0.4156, + "step": 7100 + }, + { + "epoch": 0.856110776640578, + "grad_norm": 4.769533634185791, + "learning_rate": 9.60912842908864e-07, + "loss": 0.4356, + "step": 7110 + }, + { + "epoch": 0.8573148705599036, + "grad_norm": 4.347903728485107, + 
"learning_rate": 9.60708946766058e-07, + "loss": 0.4509, + "step": 7120 + }, + { + "epoch": 0.8585189644792294, + "grad_norm": 4.265124797821045, + "learning_rate": 9.605045419462813e-07, + "loss": 0.4231, + "step": 7130 + }, + { + "epoch": 0.8597230583985551, + "grad_norm": 5.108783721923828, + "learning_rate": 9.602996286752206e-07, + "loss": 0.4363, + "step": 7140 + }, + { + "epoch": 0.8609271523178808, + "grad_norm": 5.001750946044922, + "learning_rate": 9.600942071791248e-07, + "loss": 0.4223, + "step": 7150 + }, + { + "epoch": 0.8621312462372065, + "grad_norm": 4.6718668937683105, + "learning_rate": 9.598882776848025e-07, + "loss": 0.4206, + "step": 7160 + }, + { + "epoch": 0.8633353401565322, + "grad_norm": 4.35657262802124, + "learning_rate": 9.596818404196249e-07, + "loss": 0.4136, + "step": 7170 + }, + { + "epoch": 0.8645394340758579, + "grad_norm": 4.119489669799805, + "learning_rate": 9.59474895611523e-07, + "loss": 0.4254, + "step": 7180 + }, + { + "epoch": 0.8657435279951836, + "grad_norm": 4.4842047691345215, + "learning_rate": 9.59267443488988e-07, + "loss": 0.4279, + "step": 7190 + }, + { + "epoch": 0.8669476219145094, + "grad_norm": 4.105453014373779, + "learning_rate": 9.590594842810714e-07, + "loss": 0.4031, + "step": 7200 + }, + { + "epoch": 0.868151715833835, + "grad_norm": 4.400485992431641, + "learning_rate": 9.58851018217385e-07, + "loss": 0.4098, + "step": 7210 + }, + { + "epoch": 0.8693558097531607, + "grad_norm": 4.673033714294434, + "learning_rate": 9.586420455280998e-07, + "loss": 0.4299, + "step": 7220 + }, + { + "epoch": 0.8705599036724865, + "grad_norm": 4.483117580413818, + "learning_rate": 9.584325664439463e-07, + "loss": 0.438, + "step": 7230 + }, + { + "epoch": 0.8717639975918121, + "grad_norm": 5.068016052246094, + "learning_rate": 9.58222581196214e-07, + "loss": 0.4162, + "step": 7240 + }, + { + "epoch": 0.8729680915111379, + "grad_norm": 4.488113880157471, + "learning_rate": 9.580120900167513e-07, + "loss": 0.4196, + "step": 7250 + }, + { + "epoch": 0.8741721854304636, + "grad_norm": 4.887204647064209, + "learning_rate": 9.578010931379654e-07, + "loss": 0.439, + "step": 7260 + }, + { + "epoch": 0.8753762793497892, + "grad_norm": 4.7396159172058105, + "learning_rate": 9.575895907928217e-07, + "loss": 0.4202, + "step": 7270 + }, + { + "epoch": 0.876580373269115, + "grad_norm": 4.224496364593506, + "learning_rate": 9.573775832148438e-07, + "loss": 0.4027, + "step": 7280 + }, + { + "epoch": 0.8777844671884407, + "grad_norm": 5.062420845031738, + "learning_rate": 9.57165070638113e-07, + "loss": 0.4123, + "step": 7290 + }, + { + "epoch": 0.8789885611077664, + "grad_norm": 3.75753116607666, + "learning_rate": 9.569520532972678e-07, + "loss": 0.4066, + "step": 7300 + }, + { + "epoch": 0.8801926550270921, + "grad_norm": 4.535136699676514, + "learning_rate": 9.567385314275054e-07, + "loss": 0.4067, + "step": 7310 + }, + { + "epoch": 0.8813967489464178, + "grad_norm": 4.068704128265381, + "learning_rate": 9.56524505264578e-07, + "loss": 0.4238, + "step": 7320 + }, + { + "epoch": 0.8826008428657435, + "grad_norm": 5.032285690307617, + "learning_rate": 9.563099750447965e-07, + "loss": 0.4392, + "step": 7330 + }, + { + "epoch": 0.8838049367850692, + "grad_norm": 4.432474136352539, + "learning_rate": 9.560949410050274e-07, + "loss": 0.4394, + "step": 7340 + }, + { + "epoch": 0.885009030704395, + "grad_norm": 3.7745227813720703, + "learning_rate": 9.558794033826933e-07, + "loss": 0.4228, + "step": 7350 + }, + { + "epoch": 0.8862131246237207, + "grad_norm": 
4.947648525238037, + "learning_rate": 9.556633624157734e-07, + "loss": 0.4324, + "step": 7360 + }, + { + "epoch": 0.8874172185430463, + "grad_norm": 3.695946216583252, + "learning_rate": 9.554468183428025e-07, + "loss": 0.407, + "step": 7370 + }, + { + "epoch": 0.8886213124623721, + "grad_norm": 4.399337291717529, + "learning_rate": 9.552297714028703e-07, + "loss": 0.4313, + "step": 7380 + }, + { + "epoch": 0.8898254063816978, + "grad_norm": 4.042302131652832, + "learning_rate": 9.550122218356227e-07, + "loss": 0.4183, + "step": 7390 + }, + { + "epoch": 0.8910295003010235, + "grad_norm": 4.341307163238525, + "learning_rate": 9.5479416988126e-07, + "loss": 0.4335, + "step": 7400 + }, + { + "epoch": 0.8922335942203492, + "grad_norm": 3.7946054935455322, + "learning_rate": 9.545756157805367e-07, + "loss": 0.4123, + "step": 7410 + }, + { + "epoch": 0.893437688139675, + "grad_norm": 5.04152250289917, + "learning_rate": 9.543565597747632e-07, + "loss": 0.4139, + "step": 7420 + }, + { + "epoch": 0.8946417820590006, + "grad_norm": 3.8958561420440674, + "learning_rate": 9.541370021058023e-07, + "loss": 0.4084, + "step": 7430 + }, + { + "epoch": 0.8958458759783263, + "grad_norm": 3.7490954399108887, + "learning_rate": 9.53916943016072e-07, + "loss": 0.4048, + "step": 7440 + }, + { + "epoch": 0.8970499698976521, + "grad_norm": 4.4821858406066895, + "learning_rate": 9.536963827485434e-07, + "loss": 0.3984, + "step": 7450 + }, + { + "epoch": 0.8982540638169777, + "grad_norm": 4.666491985321045, + "learning_rate": 9.53475321546741e-07, + "loss": 0.4098, + "step": 7460 + }, + { + "epoch": 0.8994581577363034, + "grad_norm": 4.890908718109131, + "learning_rate": 9.532537596547423e-07, + "loss": 0.3982, + "step": 7470 + }, + { + "epoch": 0.9006622516556292, + "grad_norm": 4.651495933532715, + "learning_rate": 9.53031697317178e-07, + "loss": 0.418, + "step": 7480 + }, + { + "epoch": 0.9018663455749548, + "grad_norm": 4.55120849609375, + "learning_rate": 9.528091347792308e-07, + "loss": 0.4187, + "step": 7490 + }, + { + "epoch": 0.9030704394942806, + "grad_norm": 5.57934045791626, + "learning_rate": 9.525860722866362e-07, + "loss": 0.4156, + "step": 7500 + }, + { + "epoch": 0.9042745334136063, + "grad_norm": 3.860431432723999, + "learning_rate": 9.523625100856813e-07, + "loss": 0.4078, + "step": 7510 + }, + { + "epoch": 0.9054786273329319, + "grad_norm": 4.670098781585693, + "learning_rate": 9.521384484232054e-07, + "loss": 0.4088, + "step": 7520 + }, + { + "epoch": 0.9066827212522577, + "grad_norm": 4.332681655883789, + "learning_rate": 9.519138875465986e-07, + "loss": 0.422, + "step": 7530 + }, + { + "epoch": 0.9078868151715834, + "grad_norm": 4.745145797729492, + "learning_rate": 9.516888277038029e-07, + "loss": 0.409, + "step": 7540 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 4.109555721282959, + "learning_rate": 9.514632691433106e-07, + "loss": 0.4177, + "step": 7550 + }, + { + "epoch": 0.9102950030102348, + "grad_norm": 5.039947032928467, + "learning_rate": 9.512372121141652e-07, + "loss": 0.4132, + "step": 7560 + }, + { + "epoch": 0.9114990969295605, + "grad_norm": 4.389688968658447, + "learning_rate": 9.510106568659599e-07, + "loss": 0.4176, + "step": 7570 + }, + { + "epoch": 0.9127031908488862, + "grad_norm": 4.67106819152832, + "learning_rate": 9.50783603648839e-07, + "loss": 0.4441, + "step": 7580 + }, + { + "epoch": 0.9139072847682119, + "grad_norm": 3.6345438957214355, + "learning_rate": 9.505560527134956e-07, + "loss": 0.395, + "step": 7590 + }, + { + "epoch": 0.9151113786875377, + 
"grad_norm": 4.544852256774902, + "learning_rate": 9.503280043111728e-07, + "loss": 0.4291, + "step": 7600 + }, + { + "epoch": 0.9163154726068633, + "grad_norm": 5.17853307723999, + "learning_rate": 9.50099458693663e-07, + "loss": 0.42, + "step": 7610 + }, + { + "epoch": 0.917519566526189, + "grad_norm": 4.111993789672852, + "learning_rate": 9.498704161133073e-07, + "loss": 0.4086, + "step": 7620 + }, + { + "epoch": 0.9187236604455148, + "grad_norm": 3.93930721282959, + "learning_rate": 9.49640876822996e-07, + "loss": 0.4128, + "step": 7630 + }, + { + "epoch": 0.9199277543648404, + "grad_norm": 4.442197322845459, + "learning_rate": 9.494108410761672e-07, + "loss": 0.4107, + "step": 7640 + }, + { + "epoch": 0.9211318482841662, + "grad_norm": 4.266764163970947, + "learning_rate": 9.491803091268077e-07, + "loss": 0.4093, + "step": 7650 + }, + { + "epoch": 0.9223359422034919, + "grad_norm": 4.633232593536377, + "learning_rate": 9.48949281229452e-07, + "loss": 0.4152, + "step": 7660 + }, + { + "epoch": 0.9235400361228175, + "grad_norm": 4.4745073318481445, + "learning_rate": 9.487177576391818e-07, + "loss": 0.4423, + "step": 7670 + }, + { + "epoch": 0.9247441300421433, + "grad_norm": 3.795365333557129, + "learning_rate": 9.484857386116268e-07, + "loss": 0.4013, + "step": 7680 + }, + { + "epoch": 0.925948223961469, + "grad_norm": 4.76974630355835, + "learning_rate": 9.48253224402963e-07, + "loss": 0.4084, + "step": 7690 + }, + { + "epoch": 0.9271523178807947, + "grad_norm": 4.584947109222412, + "learning_rate": 9.48020215269914e-07, + "loss": 0.4237, + "step": 7700 + }, + { + "epoch": 0.9283564118001204, + "grad_norm": 4.877064228057861, + "learning_rate": 9.477867114697486e-07, + "loss": 0.409, + "step": 7710 + }, + { + "epoch": 0.9295605057194462, + "grad_norm": 4.372793674468994, + "learning_rate": 9.475527132602832e-07, + "loss": 0.4142, + "step": 7720 + }, + { + "epoch": 0.9307645996387718, + "grad_norm": 4.198723316192627, + "learning_rate": 9.473182208998792e-07, + "loss": 0.4057, + "step": 7730 + }, + { + "epoch": 0.9319686935580975, + "grad_norm": 4.460008144378662, + "learning_rate": 9.470832346474435e-07, + "loss": 0.4235, + "step": 7740 + }, + { + "epoch": 0.9331727874774233, + "grad_norm": 4.3058905601501465, + "learning_rate": 9.468477547624289e-07, + "loss": 0.4307, + "step": 7750 + }, + { + "epoch": 0.9343768813967489, + "grad_norm": 4.6467132568359375, + "learning_rate": 9.466117815048329e-07, + "loss": 0.4127, + "step": 7760 + }, + { + "epoch": 0.9355809753160746, + "grad_norm": 5.1491217613220215, + "learning_rate": 9.463753151351978e-07, + "loss": 0.4181, + "step": 7770 + }, + { + "epoch": 0.9367850692354004, + "grad_norm": 5.166205883026123, + "learning_rate": 9.461383559146102e-07, + "loss": 0.4102, + "step": 7780 + }, + { + "epoch": 0.937989163154726, + "grad_norm": 4.453047275543213, + "learning_rate": 9.459009041047012e-07, + "loss": 0.4135, + "step": 7790 + }, + { + "epoch": 0.9391932570740518, + "grad_norm": 5.151276111602783, + "learning_rate": 9.456629599676456e-07, + "loss": 0.4072, + "step": 7800 + }, + { + "epoch": 0.9403973509933775, + "grad_norm": 3.8332607746124268, + "learning_rate": 9.454245237661615e-07, + "loss": 0.4363, + "step": 7810 + }, + { + "epoch": 0.9416014449127031, + "grad_norm": 4.51285982131958, + "learning_rate": 9.451855957635108e-07, + "loss": 0.4265, + "step": 7820 + }, + { + "epoch": 0.9428055388320289, + "grad_norm": 4.756032466888428, + "learning_rate": 9.449461762234981e-07, + "loss": 0.4322, + "step": 7830 + }, + { + "epoch": 
0.9440096327513546, + "grad_norm": 3.7539730072021484, + "learning_rate": 9.447062654104707e-07, + "loss": 0.4052, + "step": 7840 + }, + { + "epoch": 0.9452137266706803, + "grad_norm": 4.208081245422363, + "learning_rate": 9.444658635893186e-07, + "loss": 0.4101, + "step": 7850 + }, + { + "epoch": 0.946417820590006, + "grad_norm": 3.338568925857544, + "learning_rate": 9.442249710254737e-07, + "loss": 0.4195, + "step": 7860 + }, + { + "epoch": 0.9476219145093318, + "grad_norm": 4.421904563903809, + "learning_rate": 9.439835879849096e-07, + "loss": 0.4232, + "step": 7870 + }, + { + "epoch": 0.9488260084286574, + "grad_norm": 4.675938129425049, + "learning_rate": 9.437417147341417e-07, + "loss": 0.4171, + "step": 7880 + }, + { + "epoch": 0.9500301023479831, + "grad_norm": 5.047989845275879, + "learning_rate": 9.434993515402267e-07, + "loss": 0.4083, + "step": 7890 + }, + { + "epoch": 0.9512341962673089, + "grad_norm": 4.1763916015625, + "learning_rate": 9.432564986707621e-07, + "loss": 0.3946, + "step": 7900 + }, + { + "epoch": 0.9524382901866345, + "grad_norm": 4.706401348114014, + "learning_rate": 9.43013156393886e-07, + "loss": 0.4147, + "step": 7910 + }, + { + "epoch": 0.9536423841059603, + "grad_norm": 4.3355255126953125, + "learning_rate": 9.427693249782769e-07, + "loss": 0.4244, + "step": 7920 + }, + { + "epoch": 0.954846478025286, + "grad_norm": 5.126685619354248, + "learning_rate": 9.425250046931537e-07, + "loss": 0.4148, + "step": 7930 + }, + { + "epoch": 0.9560505719446116, + "grad_norm": 3.4599716663360596, + "learning_rate": 9.422801958082744e-07, + "loss": 0.4237, + "step": 7940 + }, + { + "epoch": 0.9572546658639374, + "grad_norm": 4.331906795501709, + "learning_rate": 9.420348985939371e-07, + "loss": 0.4097, + "step": 7950 + }, + { + "epoch": 0.9584587597832631, + "grad_norm": 4.4911370277404785, + "learning_rate": 9.417891133209787e-07, + "loss": 0.4029, + "step": 7960 + }, + { + "epoch": 0.9596628537025887, + "grad_norm": 4.601186275482178, + "learning_rate": 9.415428402607754e-07, + "loss": 0.4194, + "step": 7970 + }, + { + "epoch": 0.9608669476219145, + "grad_norm": 4.048129558563232, + "learning_rate": 9.412960796852412e-07, + "loss": 0.4205, + "step": 7980 + }, + { + "epoch": 0.9620710415412402, + "grad_norm": 4.8655571937561035, + "learning_rate": 9.410488318668292e-07, + "loss": 0.4229, + "step": 7990 + }, + { + "epoch": 0.963275135460566, + "grad_norm": 3.7495744228363037, + "learning_rate": 9.408010970785302e-07, + "loss": 0.3761, + "step": 8000 + }, + { + "epoch": 0.9644792293798916, + "grad_norm": 5.3356499671936035, + "learning_rate": 9.405528755938725e-07, + "loss": 0.4093, + "step": 8010 + }, + { + "epoch": 0.9656833232992174, + "grad_norm": 5.407442569732666, + "learning_rate": 9.403041676869217e-07, + "loss": 0.4066, + "step": 8020 + }, + { + "epoch": 0.9668874172185431, + "grad_norm": 3.860828161239624, + "learning_rate": 9.400549736322807e-07, + "loss": 0.3982, + "step": 8030 + }, + { + "epoch": 0.9680915111378687, + "grad_norm": 4.087296962738037, + "learning_rate": 9.398052937050892e-07, + "loss": 0.3951, + "step": 8040 + }, + { + "epoch": 0.9692956050571945, + "grad_norm": 4.309443473815918, + "learning_rate": 9.395551281810233e-07, + "loss": 0.4025, + "step": 8050 + }, + { + "epoch": 0.9704996989765202, + "grad_norm": 4.655600547790527, + "learning_rate": 9.39304477336295e-07, + "loss": 0.4187, + "step": 8060 + }, + { + "epoch": 0.9717037928958459, + "grad_norm": 4.34591007232666, + "learning_rate": 9.390533414476527e-07, + "loss": 0.4164, + "step": 8070 
+ }, + { + "epoch": 0.9729078868151716, + "grad_norm": 4.547005653381348, + "learning_rate": 9.388017207923798e-07, + "loss": 0.4124, + "step": 8080 + }, + { + "epoch": 0.9741119807344973, + "grad_norm": 5.021882057189941, + "learning_rate": 9.385496156482953e-07, + "loss": 0.4289, + "step": 8090 + }, + { + "epoch": 0.975316074653823, + "grad_norm": 4.165801525115967, + "learning_rate": 9.382970262937526e-07, + "loss": 0.4058, + "step": 8100 + }, + { + "epoch": 0.9765201685731487, + "grad_norm": 4.876884460449219, + "learning_rate": 9.380439530076407e-07, + "loss": 0.43, + "step": 8110 + }, + { + "epoch": 0.9777242624924745, + "grad_norm": 4.928086757659912, + "learning_rate": 9.377903960693818e-07, + "loss": 0.423, + "step": 8120 + }, + { + "epoch": 0.9789283564118001, + "grad_norm": 5.006045341491699, + "learning_rate": 9.375363557589331e-07, + "loss": 0.4354, + "step": 8130 + }, + { + "epoch": 0.9801324503311258, + "grad_norm": 3.8796417713165283, + "learning_rate": 9.372818323567846e-07, + "loss": 0.4132, + "step": 8140 + }, + { + "epoch": 0.9813365442504516, + "grad_norm": 4.275393962860107, + "learning_rate": 9.370268261439604e-07, + "loss": 0.4071, + "step": 8150 + }, + { + "epoch": 0.9825406381697772, + "grad_norm": 5.467378616333008, + "learning_rate": 9.367713374020174e-07, + "loss": 0.4049, + "step": 8160 + }, + { + "epoch": 0.983744732089103, + "grad_norm": 3.720611095428467, + "learning_rate": 9.365153664130453e-07, + "loss": 0.4008, + "step": 8170 + }, + { + "epoch": 0.9849488260084287, + "grad_norm": 4.539004802703857, + "learning_rate": 9.362589134596661e-07, + "loss": 0.4118, + "step": 8180 + }, + { + "epoch": 0.9861529199277543, + "grad_norm": 3.776636838912964, + "learning_rate": 9.360019788250342e-07, + "loss": 0.4334, + "step": 8190 + }, + { + "epoch": 0.9873570138470801, + "grad_norm": 3.8309648036956787, + "learning_rate": 9.357445627928355e-07, + "loss": 0.4179, + "step": 8200 + }, + { + "epoch": 0.9885611077664058, + "grad_norm": 4.798840045928955, + "learning_rate": 9.354866656472881e-07, + "loss": 0.4154, + "step": 8210 + }, + { + "epoch": 0.9897652016857315, + "grad_norm": 4.182796955108643, + "learning_rate": 9.352282876731403e-07, + "loss": 0.4196, + "step": 8220 + }, + { + "epoch": 0.9909692956050572, + "grad_norm": 4.6675801277160645, + "learning_rate": 9.349694291556723e-07, + "loss": 0.4182, + "step": 8230 + }, + { + "epoch": 0.9921733895243829, + "grad_norm": 4.432309627532959, + "learning_rate": 9.347100903806941e-07, + "loss": 0.4206, + "step": 8240 + }, + { + "epoch": 0.9933774834437086, + "grad_norm": 4.616915702819824, + "learning_rate": 9.344502716345463e-07, + "loss": 0.4153, + "step": 8250 + }, + { + "epoch": 0.9945815773630343, + "grad_norm": 4.290421485900879, + "learning_rate": 9.341899732040994e-07, + "loss": 0.4162, + "step": 8260 + }, + { + "epoch": 0.9957856712823601, + "grad_norm": 4.533810138702393, + "learning_rate": 9.339291953767539e-07, + "loss": 0.4113, + "step": 8270 + }, + { + "epoch": 0.9969897652016857, + "grad_norm": 4.271683692932129, + "learning_rate": 9.336679384404387e-07, + "loss": 0.4166, + "step": 8280 + }, + { + "epoch": 0.9981938591210114, + "grad_norm": 5.167937755584717, + "learning_rate": 9.334062026836127e-07, + "loss": 0.385, + "step": 8290 + }, + { + "epoch": 0.9993979530403372, + "grad_norm": 4.525483131408691, + "learning_rate": 9.331439883952628e-07, + "loss": 0.3977, + "step": 8300 + }, + { + "epoch": 1.0006020469596628, + "grad_norm": 4.6253533363342285, + "learning_rate": 9.328812958649044e-07, + "loss": 
0.4123, + "step": 8310 + }, + { + "epoch": 1.0018061408789887, + "grad_norm": 5.165332317352295, + "learning_rate": 9.326181253825812e-07, + "loss": 0.3842, + "step": 8320 + }, + { + "epoch": 1.0030102347983143, + "grad_norm": 3.894192934036255, + "learning_rate": 9.323544772388645e-07, + "loss": 0.3528, + "step": 8330 + }, + { + "epoch": 1.00421432871764, + "grad_norm": 3.8034422397613525, + "learning_rate": 9.320903517248527e-07, + "loss": 0.3817, + "step": 8340 + }, + { + "epoch": 1.0054184226369658, + "grad_norm": 4.677804946899414, + "learning_rate": 9.318257491321714e-07, + "loss": 0.3772, + "step": 8350 + }, + { + "epoch": 1.0066225165562914, + "grad_norm": 4.256035327911377, + "learning_rate": 9.315606697529733e-07, + "loss": 0.3858, + "step": 8360 + }, + { + "epoch": 1.007826610475617, + "grad_norm": 4.362122058868408, + "learning_rate": 9.312951138799371e-07, + "loss": 0.3702, + "step": 8370 + }, + { + "epoch": 1.009030704394943, + "grad_norm": 4.146007537841797, + "learning_rate": 9.310290818062681e-07, + "loss": 0.3869, + "step": 8380 + }, + { + "epoch": 1.0102347983142685, + "grad_norm": 4.480301856994629, + "learning_rate": 9.307625738256967e-07, + "loss": 0.4082, + "step": 8390 + }, + { + "epoch": 1.0114388922335942, + "grad_norm": 4.406433582305908, + "learning_rate": 9.304955902324793e-07, + "loss": 0.3846, + "step": 8400 + }, + { + "epoch": 1.01264298615292, + "grad_norm": 4.386068820953369, + "learning_rate": 9.302281313213972e-07, + "loss": 0.3806, + "step": 8410 + }, + { + "epoch": 1.0138470800722457, + "grad_norm": 4.706192970275879, + "learning_rate": 9.299601973877566e-07, + "loss": 0.385, + "step": 8420 + }, + { + "epoch": 1.0150511739915713, + "grad_norm": 5.003023624420166, + "learning_rate": 9.29691788727388e-07, + "loss": 0.3785, + "step": 8430 + }, + { + "epoch": 1.0162552679108972, + "grad_norm": 4.118617534637451, + "learning_rate": 9.294229056366463e-07, + "loss": 0.3649, + "step": 8440 + }, + { + "epoch": 1.0174593618302228, + "grad_norm": 4.070971488952637, + "learning_rate": 9.291535484124101e-07, + "loss": 0.3897, + "step": 8450 + }, + { + "epoch": 1.0186634557495484, + "grad_norm": 4.141367435455322, + "learning_rate": 9.288837173520814e-07, + "loss": 0.3712, + "step": 8460 + }, + { + "epoch": 1.0198675496688743, + "grad_norm": 4.00056791305542, + "learning_rate": 9.286134127535859e-07, + "loss": 0.372, + "step": 8470 + }, + { + "epoch": 1.0210716435882, + "grad_norm": 4.618954658508301, + "learning_rate": 9.283426349153711e-07, + "loss": 0.3708, + "step": 8480 + }, + { + "epoch": 1.0222757375075255, + "grad_norm": 4.50954008102417, + "learning_rate": 9.280713841364083e-07, + "loss": 0.3831, + "step": 8490 + }, + { + "epoch": 1.0234798314268514, + "grad_norm": 4.025129795074463, + "learning_rate": 9.277996607161898e-07, + "loss": 0.3807, + "step": 8500 + }, + { + "epoch": 1.024683925346177, + "grad_norm": 4.727366924285889, + "learning_rate": 9.275274649547307e-07, + "loss": 0.3707, + "step": 8510 + }, + { + "epoch": 1.0258880192655027, + "grad_norm": 4.731372833251953, + "learning_rate": 9.272547971525669e-07, + "loss": 0.3655, + "step": 8520 + }, + { + "epoch": 1.0270921131848285, + "grad_norm": 4.237710475921631, + "learning_rate": 9.269816576107559e-07, + "loss": 0.365, + "step": 8530 + }, + { + "epoch": 1.0282962071041541, + "grad_norm": 4.294924736022949, + "learning_rate": 9.267080466308758e-07, + "loss": 0.3774, + "step": 8540 + }, + { + "epoch": 1.0295003010234798, + "grad_norm": 4.249452590942383, + "learning_rate": 9.264339645150256e-07, + 
"loss": 0.372, + "step": 8550 + }, + { + "epoch": 1.0307043949428056, + "grad_norm": 4.078114986419678, + "learning_rate": 9.26159411565824e-07, + "loss": 0.3736, + "step": 8560 + }, + { + "epoch": 1.0319084888621313, + "grad_norm": 5.815018177032471, + "learning_rate": 9.258843880864101e-07, + "loss": 0.3708, + "step": 8570 + }, + { + "epoch": 1.033112582781457, + "grad_norm": 4.562671184539795, + "learning_rate": 9.256088943804421e-07, + "loss": 0.3926, + "step": 8580 + }, + { + "epoch": 1.0343166767007828, + "grad_norm": 5.159687042236328, + "learning_rate": 9.253329307520974e-07, + "loss": 0.3754, + "step": 8590 + }, + { + "epoch": 1.0355207706201084, + "grad_norm": 4.418034076690674, + "learning_rate": 9.250564975060725e-07, + "loss": 0.3756, + "step": 8600 + }, + { + "epoch": 1.036724864539434, + "grad_norm": 4.661262035369873, + "learning_rate": 9.247795949475823e-07, + "loss": 0.3854, + "step": 8610 + }, + { + "epoch": 1.0379289584587599, + "grad_norm": 4.768362522125244, + "learning_rate": 9.245022233823598e-07, + "loss": 0.3798, + "step": 8620 + }, + { + "epoch": 1.0391330523780855, + "grad_norm": 4.090877056121826, + "learning_rate": 9.242243831166558e-07, + "loss": 0.3883, + "step": 8630 + }, + { + "epoch": 1.0403371462974111, + "grad_norm": 4.2338032722473145, + "learning_rate": 9.23946074457239e-07, + "loss": 0.3784, + "step": 8640 + }, + { + "epoch": 1.041541240216737, + "grad_norm": 4.812314510345459, + "learning_rate": 9.236672977113947e-07, + "loss": 0.3938, + "step": 8650 + }, + { + "epoch": 1.0427453341360626, + "grad_norm": 5.055131435394287, + "learning_rate": 9.233880531869253e-07, + "loss": 0.3784, + "step": 8660 + }, + { + "epoch": 1.0439494280553883, + "grad_norm": 4.143119812011719, + "learning_rate": 9.231083411921497e-07, + "loss": 0.368, + "step": 8670 + }, + { + "epoch": 1.0451535219747141, + "grad_norm": 4.840368270874023, + "learning_rate": 9.228281620359029e-07, + "loss": 0.3771, + "step": 8680 + }, + { + "epoch": 1.0463576158940397, + "grad_norm": 4.708595275878906, + "learning_rate": 9.225475160275358e-07, + "loss": 0.3572, + "step": 8690 + }, + { + "epoch": 1.0475617098133654, + "grad_norm": 4.715826511383057, + "learning_rate": 9.222664034769145e-07, + "loss": 0.3929, + "step": 8700 + }, + { + "epoch": 1.0487658037326912, + "grad_norm": 4.969057559967041, + "learning_rate": 9.219848246944205e-07, + "loss": 0.3895, + "step": 8710 + }, + { + "epoch": 1.0499698976520169, + "grad_norm": 3.3560914993286133, + "learning_rate": 9.217027799909499e-07, + "loss": 0.379, + "step": 8720 + }, + { + "epoch": 1.0511739915713425, + "grad_norm": 4.196804046630859, + "learning_rate": 9.214202696779134e-07, + "loss": 0.3692, + "step": 8730 + }, + { + "epoch": 1.0523780854906684, + "grad_norm": 4.214865684509277, + "learning_rate": 9.211372940672355e-07, + "loss": 0.3673, + "step": 8740 + }, + { + "epoch": 1.053582179409994, + "grad_norm": 4.642685890197754, + "learning_rate": 9.208538534713548e-07, + "loss": 0.3961, + "step": 8750 + }, + { + "epoch": 1.0547862733293196, + "grad_norm": 4.921828269958496, + "learning_rate": 9.20569948203223e-07, + "loss": 0.3616, + "step": 8760 + }, + { + "epoch": 1.0559903672486455, + "grad_norm": 3.9251582622528076, + "learning_rate": 9.202855785763051e-07, + "loss": 0.3958, + "step": 8770 + }, + { + "epoch": 1.0571944611679711, + "grad_norm": 4.475203990936279, + "learning_rate": 9.200007449045785e-07, + "loss": 0.3782, + "step": 8780 + }, + { + "epoch": 1.0583985550872967, + "grad_norm": 4.735462665557861, + "learning_rate": 
9.197154475025333e-07, + "loss": 0.3571, + "step": 8790 + }, + { + "epoch": 1.0596026490066226, + "grad_norm": 4.720487117767334, + "learning_rate": 9.194296866851712e-07, + "loss": 0.3632, + "step": 8800 + }, + { + "epoch": 1.0608067429259482, + "grad_norm": 4.291871547698975, + "learning_rate": 9.191434627680063e-07, + "loss": 0.3722, + "step": 8810 + }, + { + "epoch": 1.0620108368452739, + "grad_norm": 4.449291229248047, + "learning_rate": 9.188567760670631e-07, + "loss": 0.3857, + "step": 8820 + }, + { + "epoch": 1.0632149307645997, + "grad_norm": 4.42001485824585, + "learning_rate": 9.185696268988776e-07, + "loss": 0.3798, + "step": 8830 + }, + { + "epoch": 1.0644190246839254, + "grad_norm": 4.68118953704834, + "learning_rate": 9.182820155804965e-07, + "loss": 0.364, + "step": 8840 + }, + { + "epoch": 1.065623118603251, + "grad_norm": 4.831759929656982, + "learning_rate": 9.179939424294763e-07, + "loss": 0.3656, + "step": 8850 + }, + { + "epoch": 1.0668272125225768, + "grad_norm": 4.51068115234375, + "learning_rate": 9.177054077638839e-07, + "loss": 0.3779, + "step": 8860 + }, + { + "epoch": 1.0680313064419025, + "grad_norm": 4.588883399963379, + "learning_rate": 9.174164119022956e-07, + "loss": 0.3766, + "step": 8870 + }, + { + "epoch": 1.069235400361228, + "grad_norm": 4.487590789794922, + "learning_rate": 9.171269551637968e-07, + "loss": 0.3676, + "step": 8880 + }, + { + "epoch": 1.070439494280554, + "grad_norm": 5.2501702308654785, + "learning_rate": 9.168370378679819e-07, + "loss": 0.3764, + "step": 8890 + }, + { + "epoch": 1.0716435881998796, + "grad_norm": 4.199159145355225, + "learning_rate": 9.165466603349539e-07, + "loss": 0.3736, + "step": 8900 + }, + { + "epoch": 1.0728476821192052, + "grad_norm": 4.138830184936523, + "learning_rate": 9.162558228853235e-07, + "loss": 0.3745, + "step": 8910 + }, + { + "epoch": 1.074051776038531, + "grad_norm": 4.139305114746094, + "learning_rate": 9.159645258402095e-07, + "loss": 0.3693, + "step": 8920 + }, + { + "epoch": 1.0752558699578567, + "grad_norm": 5.9480438232421875, + "learning_rate": 9.156727695212386e-07, + "loss": 0.3644, + "step": 8930 + }, + { + "epoch": 1.0764599638771823, + "grad_norm": 4.251008987426758, + "learning_rate": 9.153805542505438e-07, + "loss": 0.3844, + "step": 8940 + }, + { + "epoch": 1.0776640577965082, + "grad_norm": 4.630239486694336, + "learning_rate": 9.150878803507654e-07, + "loss": 0.3699, + "step": 8950 + }, + { + "epoch": 1.0788681517158338, + "grad_norm": 5.171538829803467, + "learning_rate": 9.147947481450498e-07, + "loss": 0.4026, + "step": 8960 + }, + { + "epoch": 1.0800722456351595, + "grad_norm": 4.777914524078369, + "learning_rate": 9.145011579570491e-07, + "loss": 0.3642, + "step": 8970 + }, + { + "epoch": 1.0812763395544853, + "grad_norm": 5.336880207061768, + "learning_rate": 9.142071101109224e-07, + "loss": 0.3926, + "step": 8980 + }, + { + "epoch": 1.082480433473811, + "grad_norm": 3.8747503757476807, + "learning_rate": 9.139126049313321e-07, + "loss": 0.3792, + "step": 8990 + }, + { + "epoch": 1.0836845273931366, + "grad_norm": 4.528430461883545, + "learning_rate": 9.136176427434475e-07, + "loss": 0.3735, + "step": 9000 + }, + { + "epoch": 1.0848886213124624, + "grad_norm": 5.05435848236084, + "learning_rate": 9.133222238729412e-07, + "loss": 0.3604, + "step": 9010 + }, + { + "epoch": 1.086092715231788, + "grad_norm": 4.354115962982178, + "learning_rate": 9.130263486459904e-07, + "loss": 0.3995, + "step": 9020 + }, + { + "epoch": 1.0872968091511137, + "grad_norm": 5.124173164367676, + 
"learning_rate": 9.127300173892763e-07, + "loss": 0.3622, + "step": 9030 + }, + { + "epoch": 1.0885009030704396, + "grad_norm": 4.644625186920166, + "learning_rate": 9.124332304299838e-07, + "loss": 0.3704, + "step": 9040 + }, + { + "epoch": 1.0897049969897652, + "grad_norm": 4.276961803436279, + "learning_rate": 9.121359880958002e-07, + "loss": 0.3771, + "step": 9050 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 4.1808648109436035, + "learning_rate": 9.118382907149163e-07, + "loss": 0.3638, + "step": 9060 + }, + { + "epoch": 1.0921131848284167, + "grad_norm": 4.921030521392822, + "learning_rate": 9.115401386160251e-07, + "loss": 0.3633, + "step": 9070 + }, + { + "epoch": 1.0933172787477423, + "grad_norm": 4.0871663093566895, + "learning_rate": 9.112415321283217e-07, + "loss": 0.358, + "step": 9080 + }, + { + "epoch": 1.094521372667068, + "grad_norm": 3.419311046600342, + "learning_rate": 9.10942471581503e-07, + "loss": 0.3601, + "step": 9090 + }, + { + "epoch": 1.0957254665863938, + "grad_norm": 4.138514518737793, + "learning_rate": 9.106429573057666e-07, + "loss": 0.3764, + "step": 9100 + }, + { + "epoch": 1.0969295605057194, + "grad_norm": 5.0829691886901855, + "learning_rate": 9.10342989631812e-07, + "loss": 0.3756, + "step": 9110 + }, + { + "epoch": 1.098133654425045, + "grad_norm": 4.330390930175781, + "learning_rate": 9.100425688908386e-07, + "loss": 0.3587, + "step": 9120 + }, + { + "epoch": 1.099337748344371, + "grad_norm": 5.1065592765808105, + "learning_rate": 9.097416954145465e-07, + "loss": 0.38, + "step": 9130 + }, + { + "epoch": 1.1005418422636966, + "grad_norm": 4.509856224060059, + "learning_rate": 9.094403695351352e-07, + "loss": 0.38, + "step": 9140 + }, + { + "epoch": 1.1017459361830222, + "grad_norm": 5.324617862701416, + "learning_rate": 9.091385915853042e-07, + "loss": 0.3658, + "step": 9150 + }, + { + "epoch": 1.102950030102348, + "grad_norm": 5.061591148376465, + "learning_rate": 9.088363618982521e-07, + "loss": 0.3723, + "step": 9160 + }, + { + "epoch": 1.1041541240216737, + "grad_norm": 5.028870582580566, + "learning_rate": 9.085336808076758e-07, + "loss": 0.3837, + "step": 9170 + }, + { + "epoch": 1.1053582179409993, + "grad_norm": 4.214852809906006, + "learning_rate": 9.082305486477708e-07, + "loss": 0.3681, + "step": 9180 + }, + { + "epoch": 1.1065623118603252, + "grad_norm": 4.787420272827148, + "learning_rate": 9.079269657532311e-07, + "loss": 0.3843, + "step": 9190 + }, + { + "epoch": 1.1077664057796508, + "grad_norm": 3.78640079498291, + "learning_rate": 9.076229324592477e-07, + "loss": 0.3747, + "step": 9200 + }, + { + "epoch": 1.1089704996989764, + "grad_norm": 4.786212921142578, + "learning_rate": 9.073184491015094e-07, + "loss": 0.3684, + "step": 9210 + }, + { + "epoch": 1.1101745936183023, + "grad_norm": 3.932164430618286, + "learning_rate": 9.070135160162015e-07, + "loss": 0.3662, + "step": 9220 + }, + { + "epoch": 1.111378687537628, + "grad_norm": 4.249774932861328, + "learning_rate": 9.067081335400061e-07, + "loss": 0.3722, + "step": 9230 + }, + { + "epoch": 1.1125827814569536, + "grad_norm": 4.269323348999023, + "learning_rate": 9.064023020101015e-07, + "loss": 0.3765, + "step": 9240 + }, + { + "epoch": 1.1137868753762794, + "grad_norm": 4.183831214904785, + "learning_rate": 9.060960217641617e-07, + "loss": 0.3657, + "step": 9250 + }, + { + "epoch": 1.114990969295605, + "grad_norm": 4.336716175079346, + "learning_rate": 9.057892931403563e-07, + "loss": 0.3869, + "step": 9260 + }, + { + "epoch": 1.1161950632149307, + "grad_norm": 
4.948883533477783, + "learning_rate": 9.054821164773498e-07, + "loss": 0.3823, + "step": 9270 + }, + { + "epoch": 1.1173991571342565, + "grad_norm": 4.687775611877441, + "learning_rate": 9.051744921143014e-07, + "loss": 0.3853, + "step": 9280 + }, + { + "epoch": 1.1186032510535822, + "grad_norm": 4.803307056427002, + "learning_rate": 9.048664203908647e-07, + "loss": 0.3609, + "step": 9290 + }, + { + "epoch": 1.1198073449729078, + "grad_norm": 4.377987861633301, + "learning_rate": 9.045579016471871e-07, + "loss": 0.3873, + "step": 9300 + }, + { + "epoch": 1.1210114388922336, + "grad_norm": 4.264991760253906, + "learning_rate": 9.042489362239096e-07, + "loss": 0.3663, + "step": 9310 + }, + { + "epoch": 1.1222155328115593, + "grad_norm": 4.69897985458374, + "learning_rate": 9.039395244621667e-07, + "loss": 0.3797, + "step": 9320 + }, + { + "epoch": 1.123419626730885, + "grad_norm": 4.6573357582092285, + "learning_rate": 9.036296667035853e-07, + "loss": 0.3774, + "step": 9330 + }, + { + "epoch": 1.1246237206502108, + "grad_norm": 4.6396307945251465, + "learning_rate": 9.033193632902848e-07, + "loss": 0.3708, + "step": 9340 + }, + { + "epoch": 1.1258278145695364, + "grad_norm": 4.781702518463135, + "learning_rate": 9.030086145648767e-07, + "loss": 0.366, + "step": 9350 + }, + { + "epoch": 1.127031908488862, + "grad_norm": 3.859081745147705, + "learning_rate": 9.026974208704645e-07, + "loss": 0.3592, + "step": 9360 + }, + { + "epoch": 1.1282360024081879, + "grad_norm": 3.917964220046997, + "learning_rate": 9.023857825506425e-07, + "loss": 0.3828, + "step": 9370 + }, + { + "epoch": 1.1294400963275135, + "grad_norm": 4.249654293060303, + "learning_rate": 9.020736999494962e-07, + "loss": 0.3816, + "step": 9380 + }, + { + "epoch": 1.1306441902468394, + "grad_norm": 4.181410789489746, + "learning_rate": 9.017611734116015e-07, + "loss": 0.3881, + "step": 9390 + }, + { + "epoch": 1.131848284166165, + "grad_norm": 4.529959678649902, + "learning_rate": 9.014482032820245e-07, + "loss": 0.3866, + "step": 9400 + }, + { + "epoch": 1.1330523780854906, + "grad_norm": 4.115703105926514, + "learning_rate": 9.011347899063212e-07, + "loss": 0.4017, + "step": 9410 + }, + { + "epoch": 1.1342564720048163, + "grad_norm": 5.330405235290527, + "learning_rate": 9.008209336305369e-07, + "loss": 0.382, + "step": 9420 + }, + { + "epoch": 1.1354605659241421, + "grad_norm": 4.53489351272583, + "learning_rate": 9.005066348012058e-07, + "loss": 0.4002, + "step": 9430 + }, + { + "epoch": 1.1366646598434678, + "grad_norm": 4.984791278839111, + "learning_rate": 9.00191893765351e-07, + "loss": 0.3699, + "step": 9440 + }, + { + "epoch": 1.1378687537627936, + "grad_norm": 4.83209753036499, + "learning_rate": 8.998767108704836e-07, + "loss": 0.3612, + "step": 9450 + }, + { + "epoch": 1.1390728476821192, + "grad_norm": 4.549959659576416, + "learning_rate": 8.995610864646029e-07, + "loss": 0.3552, + "step": 9460 + }, + { + "epoch": 1.1402769416014449, + "grad_norm": 4.30760383605957, + "learning_rate": 8.992450208961949e-07, + "loss": 0.3796, + "step": 9470 + }, + { + "epoch": 1.1414810355207705, + "grad_norm": 4.3470234870910645, + "learning_rate": 8.989285145142338e-07, + "loss": 0.3868, + "step": 9480 + }, + { + "epoch": 1.1426851294400964, + "grad_norm": 4.755895614624023, + "learning_rate": 8.986115676681796e-07, + "loss": 0.3867, + "step": 9490 + }, + { + "epoch": 1.143889223359422, + "grad_norm": 4.874184608459473, + "learning_rate": 8.982941807079791e-07, + "loss": 0.3866, + "step": 9500 + }, + { + "epoch": 1.1450933172787479, + 
"grad_norm": 4.068636894226074, + "learning_rate": 8.979763539840649e-07, + "loss": 0.3558, + "step": 9510 + }, + { + "epoch": 1.1462974111980735, + "grad_norm": 4.380646705627441, + "learning_rate": 8.976580878473552e-07, + "loss": 0.3704, + "step": 9520 + }, + { + "epoch": 1.1475015051173991, + "grad_norm": 4.3028950691223145, + "learning_rate": 8.973393826492531e-07, + "loss": 0.3995, + "step": 9530 + }, + { + "epoch": 1.1487055990367248, + "grad_norm": 4.423670768737793, + "learning_rate": 8.97020238741647e-07, + "loss": 0.38, + "step": 9540 + }, + { + "epoch": 1.1499096929560506, + "grad_norm": 4.808249473571777, + "learning_rate": 8.967006564769093e-07, + "loss": 0.3779, + "step": 9550 + }, + { + "epoch": 1.1511137868753762, + "grad_norm": 5.734920501708984, + "learning_rate": 8.963806362078963e-07, + "loss": 0.3713, + "step": 9560 + }, + { + "epoch": 1.152317880794702, + "grad_norm": 4.730371952056885, + "learning_rate": 8.960601782879483e-07, + "loss": 0.3583, + "step": 9570 + }, + { + "epoch": 1.1535219747140277, + "grad_norm": 5.035944938659668, + "learning_rate": 8.957392830708886e-07, + "loss": 0.39, + "step": 9580 + }, + { + "epoch": 1.1547260686333534, + "grad_norm": 4.2402119636535645, + "learning_rate": 8.95417950911023e-07, + "loss": 0.3655, + "step": 9590 + }, + { + "epoch": 1.155930162552679, + "grad_norm": 3.995563507080078, + "learning_rate": 8.950961821631406e-07, + "loss": 0.3657, + "step": 9600 + }, + { + "epoch": 1.1571342564720049, + "grad_norm": 5.285823822021484, + "learning_rate": 8.947739771825117e-07, + "loss": 0.3825, + "step": 9610 + }, + { + "epoch": 1.1583383503913305, + "grad_norm": 4.332102298736572, + "learning_rate": 8.944513363248885e-07, + "loss": 0.3808, + "step": 9620 + }, + { + "epoch": 1.1595424443106563, + "grad_norm": 4.714332103729248, + "learning_rate": 8.941282599465047e-07, + "loss": 0.3904, + "step": 9630 + }, + { + "epoch": 1.160746538229982, + "grad_norm": 3.8975484371185303, + "learning_rate": 8.938047484040748e-07, + "loss": 0.3559, + "step": 9640 + }, + { + "epoch": 1.1619506321493076, + "grad_norm": 4.700948238372803, + "learning_rate": 8.934808020547935e-07, + "loss": 0.3676, + "step": 9650 + }, + { + "epoch": 1.1631547260686332, + "grad_norm": 4.926019191741943, + "learning_rate": 8.931564212563356e-07, + "loss": 0.3913, + "step": 9660 + }, + { + "epoch": 1.164358819987959, + "grad_norm": 4.402989864349365, + "learning_rate": 8.92831606366856e-07, + "loss": 0.3672, + "step": 9670 + }, + { + "epoch": 1.1655629139072847, + "grad_norm": 4.371270656585693, + "learning_rate": 8.925063577449886e-07, + "loss": 0.3529, + "step": 9680 + }, + { + "epoch": 1.1667670078266106, + "grad_norm": 5.072457790374756, + "learning_rate": 8.92180675749846e-07, + "loss": 0.3703, + "step": 9690 + }, + { + "epoch": 1.1679711017459362, + "grad_norm": 5.789607524871826, + "learning_rate": 8.918545607410197e-07, + "loss": 0.3618, + "step": 9700 + }, + { + "epoch": 1.1691751956652618, + "grad_norm": 4.929603576660156, + "learning_rate": 8.91528013078579e-07, + "loss": 0.3632, + "step": 9710 + }, + { + "epoch": 1.1703792895845875, + "grad_norm": 4.385134220123291, + "learning_rate": 8.91201033123071e-07, + "loss": 0.3726, + "step": 9720 + }, + { + "epoch": 1.1715833835039133, + "grad_norm": 4.493896961212158, + "learning_rate": 8.908736212355201e-07, + "loss": 0.396, + "step": 9730 + }, + { + "epoch": 1.172787477423239, + "grad_norm": 5.4288859367370605, + "learning_rate": 8.905457777774278e-07, + "loss": 0.3693, + "step": 9740 + }, + { + "epoch": 
1.1739915713425648, + "grad_norm": 4.925263404846191, + "learning_rate": 8.902175031107717e-07, + "loss": 0.3809, + "step": 9750 + }, + { + "epoch": 1.1751956652618905, + "grad_norm": 4.450766086578369, + "learning_rate": 8.898887975980058e-07, + "loss": 0.3747, + "step": 9760 + }, + { + "epoch": 1.176399759181216, + "grad_norm": 5.003162860870361, + "learning_rate": 8.895596616020595e-07, + "loss": 0.3763, + "step": 9770 + }, + { + "epoch": 1.1776038531005417, + "grad_norm": 5.204108238220215, + "learning_rate": 8.89230095486338e-07, + "loss": 0.3983, + "step": 9780 + }, + { + "epoch": 1.1788079470198676, + "grad_norm": 5.1089372634887695, + "learning_rate": 8.889000996147213e-07, + "loss": 0.3757, + "step": 9790 + }, + { + "epoch": 1.1800120409391932, + "grad_norm": 5.394412994384766, + "learning_rate": 8.885696743515632e-07, + "loss": 0.3764, + "step": 9800 + }, + { + "epoch": 1.181216134858519, + "grad_norm": 4.811611175537109, + "learning_rate": 8.882388200616926e-07, + "loss": 0.3686, + "step": 9810 + }, + { + "epoch": 1.1824202287778447, + "grad_norm": 4.908543109893799, + "learning_rate": 8.879075371104113e-07, + "loss": 0.368, + "step": 9820 + }, + { + "epoch": 1.1836243226971703, + "grad_norm": 4.540360450744629, + "learning_rate": 8.875758258634949e-07, + "loss": 0.3698, + "step": 9830 + }, + { + "epoch": 1.1848284166164962, + "grad_norm": 4.033935546875, + "learning_rate": 8.872436866871917e-07, + "loss": 0.3522, + "step": 9840 + }, + { + "epoch": 1.1860325105358218, + "grad_norm": 5.225256443023682, + "learning_rate": 8.869111199482225e-07, + "loss": 0.3837, + "step": 9850 + }, + { + "epoch": 1.1872366044551474, + "grad_norm": 4.02462100982666, + "learning_rate": 8.865781260137801e-07, + "loss": 0.381, + "step": 9860 + }, + { + "epoch": 1.1884406983744733, + "grad_norm": 4.905768871307373, + "learning_rate": 8.862447052515291e-07, + "loss": 0.384, + "step": 9870 + }, + { + "epoch": 1.189644792293799, + "grad_norm": 4.620838642120361, + "learning_rate": 8.859108580296053e-07, + "loss": 0.3533, + "step": 9880 + }, + { + "epoch": 1.1908488862131246, + "grad_norm": 4.312672138214111, + "learning_rate": 8.855765847166154e-07, + "loss": 0.3591, + "step": 9890 + }, + { + "epoch": 1.1920529801324504, + "grad_norm": 4.337918758392334, + "learning_rate": 8.852418856816365e-07, + "loss": 0.374, + "step": 9900 + }, + { + "epoch": 1.193257074051776, + "grad_norm": 4.154960632324219, + "learning_rate": 8.849067612942158e-07, + "loss": 0.3551, + "step": 9910 + }, + { + "epoch": 1.1944611679711017, + "grad_norm": 4.451188564300537, + "learning_rate": 8.845712119243701e-07, + "loss": 0.3699, + "step": 9920 + }, + { + "epoch": 1.1956652618904275, + "grad_norm": 5.723966598510742, + "learning_rate": 8.842352379425853e-07, + "loss": 0.3875, + "step": 9930 + }, + { + "epoch": 1.1968693558097532, + "grad_norm": 4.982749938964844, + "learning_rate": 8.838988397198166e-07, + "loss": 0.375, + "step": 9940 + }, + { + "epoch": 1.1980734497290788, + "grad_norm": 4.661801338195801, + "learning_rate": 8.835620176274869e-07, + "loss": 0.3721, + "step": 9950 + }, + { + "epoch": 1.1992775436484047, + "grad_norm": 5.228112697601318, + "learning_rate": 8.832247720374879e-07, + "loss": 0.366, + "step": 9960 + }, + { + "epoch": 1.2004816375677303, + "grad_norm": 4.082928657531738, + "learning_rate": 8.828871033221782e-07, + "loss": 0.3621, + "step": 9970 + }, + { + "epoch": 1.201685731487056, + "grad_norm": 3.532892942428589, + "learning_rate": 8.82549011854384e-07, + "loss": 0.365, + "step": 9980 + }, + { + 
"epoch": 1.2028898254063818, + "grad_norm": 4.03758430480957, + "learning_rate": 8.822104980073978e-07, + "loss": 0.3786, + "step": 9990 + }, + { + "epoch": 1.2040939193257074, + "grad_norm": 4.233405590057373, + "learning_rate": 8.818715621549792e-07, + "loss": 0.3664, + "step": 10000 + }, + { + "epoch": 1.205298013245033, + "grad_norm": 4.029031753540039, + "learning_rate": 8.815322046713531e-07, + "loss": 0.3655, + "step": 10010 + }, + { + "epoch": 1.206502107164359, + "grad_norm": 4.398824691772461, + "learning_rate": 8.811924259312102e-07, + "loss": 0.3818, + "step": 10020 + }, + { + "epoch": 1.2077062010836845, + "grad_norm": 4.394994258880615, + "learning_rate": 8.808522263097063e-07, + "loss": 0.3875, + "step": 10030 + }, + { + "epoch": 1.2089102950030102, + "grad_norm": 4.941735744476318, + "learning_rate": 8.805116061824617e-07, + "loss": 0.3635, + "step": 10040 + }, + { + "epoch": 1.210114388922336, + "grad_norm": 4.183002471923828, + "learning_rate": 8.801705659255616e-07, + "loss": 0.3718, + "step": 10050 + }, + { + "epoch": 1.2113184828416617, + "grad_norm": 3.9239907264709473, + "learning_rate": 8.798291059155541e-07, + "loss": 0.3562, + "step": 10060 + }, + { + "epoch": 1.2125225767609873, + "grad_norm": 4.399021625518799, + "learning_rate": 8.794872265294516e-07, + "loss": 0.3577, + "step": 10070 + }, + { + "epoch": 1.2137266706803131, + "grad_norm": 3.739692211151123, + "learning_rate": 8.791449281447291e-07, + "loss": 0.3715, + "step": 10080 + }, + { + "epoch": 1.2149307645996388, + "grad_norm": 6.101430416107178, + "learning_rate": 8.788022111393245e-07, + "loss": 0.3791, + "step": 10090 + }, + { + "epoch": 1.2161348585189644, + "grad_norm": 4.473653793334961, + "learning_rate": 8.784590758916377e-07, + "loss": 0.3733, + "step": 10100 + }, + { + "epoch": 1.2173389524382903, + "grad_norm": 5.723465919494629, + "learning_rate": 8.781155227805304e-07, + "loss": 0.376, + "step": 10110 + }, + { + "epoch": 1.218543046357616, + "grad_norm": 6.045252323150635, + "learning_rate": 8.777715521853257e-07, + "loss": 0.383, + "step": 10120 + }, + { + "epoch": 1.2197471402769415, + "grad_norm": 4.978476524353027, + "learning_rate": 8.774271644858078e-07, + "loss": 0.3902, + "step": 10130 + }, + { + "epoch": 1.2209512341962674, + "grad_norm": 4.655144691467285, + "learning_rate": 8.770823600622212e-07, + "loss": 0.3832, + "step": 10140 + }, + { + "epoch": 1.222155328115593, + "grad_norm": 4.3407883644104, + "learning_rate": 8.767371392952708e-07, + "loss": 0.3582, + "step": 10150 + }, + { + "epoch": 1.2233594220349187, + "grad_norm": 4.6942596435546875, + "learning_rate": 8.763915025661206e-07, + "loss": 0.3755, + "step": 10160 + }, + { + "epoch": 1.2245635159542445, + "grad_norm": 4.285218715667725, + "learning_rate": 8.760454502563947e-07, + "loss": 0.3776, + "step": 10170 + }, + { + "epoch": 1.2257676098735701, + "grad_norm": 4.890243053436279, + "learning_rate": 8.756989827481755e-07, + "loss": 0.37, + "step": 10180 + }, + { + "epoch": 1.2269717037928958, + "grad_norm": 4.752533912658691, + "learning_rate": 8.753521004240038e-07, + "loss": 0.3717, + "step": 10190 + }, + { + "epoch": 1.2281757977122216, + "grad_norm": 4.077126502990723, + "learning_rate": 8.750048036668789e-07, + "loss": 0.3811, + "step": 10200 + }, + { + "epoch": 1.2293798916315473, + "grad_norm": 3.9369449615478516, + "learning_rate": 8.74657092860257e-07, + "loss": 0.3737, + "step": 10210 + }, + { + "epoch": 1.230583985550873, + "grad_norm": 4.381350040435791, + "learning_rate": 8.74308968388052e-07, + "loss": 
0.3528, + "step": 10220 + }, + { + "epoch": 1.2317880794701987, + "grad_norm": 4.581336975097656, + "learning_rate": 8.739604306346342e-07, + "loss": 0.3728, + "step": 10230 + }, + { + "epoch": 1.2329921733895244, + "grad_norm": 5.837801933288574, + "learning_rate": 8.736114799848306e-07, + "loss": 0.3812, + "step": 10240 + }, + { + "epoch": 1.23419626730885, + "grad_norm": 4.347848892211914, + "learning_rate": 8.732621168239236e-07, + "loss": 0.3818, + "step": 10250 + }, + { + "epoch": 1.2354003612281759, + "grad_norm": 4.717700004577637, + "learning_rate": 8.729123415376514e-07, + "loss": 0.3516, + "step": 10260 + }, + { + "epoch": 1.2366044551475015, + "grad_norm": 4.809170722961426, + "learning_rate": 8.725621545122072e-07, + "loss": 0.3642, + "step": 10270 + }, + { + "epoch": 1.2378085490668271, + "grad_norm": 4.547823905944824, + "learning_rate": 8.722115561342387e-07, + "loss": 0.3791, + "step": 10280 + }, + { + "epoch": 1.239012642986153, + "grad_norm": 4.235891819000244, + "learning_rate": 8.718605467908478e-07, + "loss": 0.3663, + "step": 10290 + }, + { + "epoch": 1.2402167369054786, + "grad_norm": 4.648200035095215, + "learning_rate": 8.715091268695901e-07, + "loss": 0.3623, + "step": 10300 + }, + { + "epoch": 1.2414208308248043, + "grad_norm": 4.6003737449646, + "learning_rate": 8.711572967584747e-07, + "loss": 0.378, + "step": 10310 + }, + { + "epoch": 1.24262492474413, + "grad_norm": 4.921525001525879, + "learning_rate": 8.708050568459635e-07, + "loss": 0.3602, + "step": 10320 + }, + { + "epoch": 1.2438290186634557, + "grad_norm": 4.075355052947998, + "learning_rate": 8.704524075209709e-07, + "loss": 0.3698, + "step": 10330 + }, + { + "epoch": 1.2450331125827814, + "grad_norm": 5.707545280456543, + "learning_rate": 8.700993491728634e-07, + "loss": 0.3538, + "step": 10340 + }, + { + "epoch": 1.2462372065021072, + "grad_norm": 4.669870853424072, + "learning_rate": 8.697458821914587e-07, + "loss": 0.3685, + "step": 10350 + }, + { + "epoch": 1.2474413004214329, + "grad_norm": 4.101998329162598, + "learning_rate": 8.693920069670264e-07, + "loss": 0.3823, + "step": 10360 + }, + { + "epoch": 1.2486453943407585, + "grad_norm": 4.307315349578857, + "learning_rate": 8.690377238902862e-07, + "loss": 0.3718, + "step": 10370 + }, + { + "epoch": 1.2498494882600844, + "grad_norm": 4.498570442199707, + "learning_rate": 8.686830333524084e-07, + "loss": 0.3894, + "step": 10380 + }, + { + "epoch": 1.25105358217941, + "grad_norm": 4.348161697387695, + "learning_rate": 8.68327935745013e-07, + "loss": 0.3661, + "step": 10390 + }, + { + "epoch": 1.2522576760987358, + "grad_norm": 4.509785175323486, + "learning_rate": 8.679724314601701e-07, + "loss": 0.3691, + "step": 10400 + }, + { + "epoch": 1.2534617700180615, + "grad_norm": 4.251500606536865, + "learning_rate": 8.676165208903978e-07, + "loss": 0.3489, + "step": 10410 + }, + { + "epoch": 1.254665863937387, + "grad_norm": 3.91599702835083, + "learning_rate": 8.672602044286637e-07, + "loss": 0.3835, + "step": 10420 + }, + { + "epoch": 1.2558699578567127, + "grad_norm": 4.641791820526123, + "learning_rate": 8.66903482468383e-07, + "loss": 0.3676, + "step": 10430 + }, + { + "epoch": 1.2570740517760386, + "grad_norm": 6.0034499168396, + "learning_rate": 8.665463554034187e-07, + "loss": 0.3728, + "step": 10440 + }, + { + "epoch": 1.2582781456953642, + "grad_norm": 5.09488582611084, + "learning_rate": 8.661888236280813e-07, + "loss": 0.3718, + "step": 10450 + }, + { + "epoch": 1.25948223961469, + "grad_norm": 5.368484020233154, + "learning_rate": 
8.658308875371279e-07, + "loss": 0.3908, + "step": 10460 + }, + { + "epoch": 1.2606863335340157, + "grad_norm": 5.200775623321533, + "learning_rate": 8.654725475257621e-07, + "loss": 0.3655, + "step": 10470 + }, + { + "epoch": 1.2618904274533413, + "grad_norm": 4.358388900756836, + "learning_rate": 8.651138039896338e-07, + "loss": 0.3748, + "step": 10480 + }, + { + "epoch": 1.263094521372667, + "grad_norm": 4.452842712402344, + "learning_rate": 8.647546573248377e-07, + "loss": 0.3731, + "step": 10490 + }, + { + "epoch": 1.2642986152919928, + "grad_norm": 4.0504584312438965, + "learning_rate": 8.643951079279144e-07, + "loss": 0.3767, + "step": 10500 + }, + { + "epoch": 1.2655027092113185, + "grad_norm": 5.186153411865234, + "learning_rate": 8.640351561958486e-07, + "loss": 0.362, + "step": 10510 + }, + { + "epoch": 1.2667068031306443, + "grad_norm": 4.57370662689209, + "learning_rate": 8.636748025260696e-07, + "loss": 0.3766, + "step": 10520 + }, + { + "epoch": 1.26791089704997, + "grad_norm": 5.416035175323486, + "learning_rate": 8.633140473164502e-07, + "loss": 0.3653, + "step": 10530 + }, + { + "epoch": 1.2691149909692956, + "grad_norm": 4.351581573486328, + "learning_rate": 8.629528909653065e-07, + "loss": 0.3556, + "step": 10540 + }, + { + "epoch": 1.2703190848886212, + "grad_norm": 5.305721759796143, + "learning_rate": 8.625913338713982e-07, + "loss": 0.3873, + "step": 10550 + }, + { + "epoch": 1.271523178807947, + "grad_norm": 3.8972630500793457, + "learning_rate": 8.622293764339264e-07, + "loss": 0.3812, + "step": 10560 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 5.005763530731201, + "learning_rate": 8.61867019052535e-07, + "loss": 0.3761, + "step": 10570 + }, + { + "epoch": 1.2739313666465986, + "grad_norm": 4.1513848304748535, + "learning_rate": 8.615042621273093e-07, + "loss": 0.3525, + "step": 10580 + }, + { + "epoch": 1.2751354605659242, + "grad_norm": 5.166493892669678, + "learning_rate": 8.611411060587757e-07, + "loss": 0.3866, + "step": 10590 + }, + { + "epoch": 1.2763395544852498, + "grad_norm": 4.168553352355957, + "learning_rate": 8.60777551247901e-07, + "loss": 0.3735, + "step": 10600 + }, + { + "epoch": 1.2775436484045755, + "grad_norm": 4.891838550567627, + "learning_rate": 8.60413598096093e-07, + "loss": 0.3603, + "step": 10610 + }, + { + "epoch": 1.2787477423239013, + "grad_norm": 4.317160606384277, + "learning_rate": 8.600492470051983e-07, + "loss": 0.3765, + "step": 10620 + }, + { + "epoch": 1.279951836243227, + "grad_norm": 4.056015968322754, + "learning_rate": 8.59684498377504e-07, + "loss": 0.3704, + "step": 10630 + }, + { + "epoch": 1.2811559301625528, + "grad_norm": 4.8416242599487305, + "learning_rate": 8.593193526157354e-07, + "loss": 0.3475, + "step": 10640 + }, + { + "epoch": 1.2823600240818784, + "grad_norm": 5.178276062011719, + "learning_rate": 8.589538101230564e-07, + "loss": 0.3823, + "step": 10650 + }, + { + "epoch": 1.283564118001204, + "grad_norm": 4.507132053375244, + "learning_rate": 8.58587871303069e-07, + "loss": 0.3597, + "step": 10660 + }, + { + "epoch": 1.2847682119205297, + "grad_norm": 4.44130277633667, + "learning_rate": 8.582215365598127e-07, + "loss": 0.3748, + "step": 10670 + }, + { + "epoch": 1.2859723058398556, + "grad_norm": 4.559373378753662, + "learning_rate": 8.578548062977644e-07, + "loss": 0.3684, + "step": 10680 + }, + { + "epoch": 1.2871763997591812, + "grad_norm": 4.59391450881958, + "learning_rate": 8.574876809218374e-07, + "loss": 0.3729, + "step": 10690 + }, + { + "epoch": 1.288380493678507, + "grad_norm": 
4.64610481262207, + "learning_rate": 8.571201608373815e-07, + "loss": 0.367, + "step": 10700 + }, + { + "epoch": 1.2895845875978327, + "grad_norm": 5.637624740600586, + "learning_rate": 8.56752246450182e-07, + "loss": 0.3799, + "step": 10710 + }, + { + "epoch": 1.2907886815171583, + "grad_norm": 4.1183271408081055, + "learning_rate": 8.563839381664599e-07, + "loss": 0.3744, + "step": 10720 + }, + { + "epoch": 1.291992775436484, + "grad_norm": 5.679279327392578, + "learning_rate": 8.560152363928709e-07, + "loss": 0.3636, + "step": 10730 + }, + { + "epoch": 1.2931968693558098, + "grad_norm": 4.73154878616333, + "learning_rate": 8.556461415365052e-07, + "loss": 0.3772, + "step": 10740 + }, + { + "epoch": 1.2944009632751354, + "grad_norm": 4.206639289855957, + "learning_rate": 8.552766540048871e-07, + "loss": 0.3652, + "step": 10750 + }, + { + "epoch": 1.2956050571944613, + "grad_norm": 4.551361083984375, + "learning_rate": 8.549067742059741e-07, + "loss": 0.36, + "step": 10760 + }, + { + "epoch": 1.296809151113787, + "grad_norm": 4.472609043121338, + "learning_rate": 8.545365025481574e-07, + "loss": 0.3949, + "step": 10770 + }, + { + "epoch": 1.2980132450331126, + "grad_norm": 3.9386298656463623, + "learning_rate": 8.541658394402605e-07, + "loss": 0.3736, + "step": 10780 + }, + { + "epoch": 1.2992173389524382, + "grad_norm": 5.128427505493164, + "learning_rate": 8.537947852915388e-07, + "loss": 0.3708, + "step": 10790 + }, + { + "epoch": 1.300421432871764, + "grad_norm": 4.362430095672607, + "learning_rate": 8.534233405116804e-07, + "loss": 0.3707, + "step": 10800 + }, + { + "epoch": 1.3016255267910897, + "grad_norm": 5.032322883605957, + "learning_rate": 8.530515055108036e-07, + "loss": 0.3694, + "step": 10810 + }, + { + "epoch": 1.3028296207104155, + "grad_norm": 3.745659828186035, + "learning_rate": 8.526792806994585e-07, + "loss": 0.3531, + "step": 10820 + }, + { + "epoch": 1.3040337146297412, + "grad_norm": 3.8410699367523193, + "learning_rate": 8.523066664886248e-07, + "loss": 0.3591, + "step": 10830 + }, + { + "epoch": 1.3052378085490668, + "grad_norm": 6.065695285797119, + "learning_rate": 8.519336632897128e-07, + "loss": 0.3748, + "step": 10840 + }, + { + "epoch": 1.3064419024683924, + "grad_norm": 4.5033464431762695, + "learning_rate": 8.515602715145615e-07, + "loss": 0.3661, + "step": 10850 + }, + { + "epoch": 1.3076459963877183, + "grad_norm": 4.6679558753967285, + "learning_rate": 8.511864915754399e-07, + "loss": 0.3835, + "step": 10860 + }, + { + "epoch": 1.308850090307044, + "grad_norm": 4.266571998596191, + "learning_rate": 8.50812323885045e-07, + "loss": 0.3799, + "step": 10870 + }, + { + "epoch": 1.3100541842263698, + "grad_norm": 4.90196418762207, + "learning_rate": 8.504377688565019e-07, + "loss": 0.3551, + "step": 10880 + }, + { + "epoch": 1.3112582781456954, + "grad_norm": 4.301276683807373, + "learning_rate": 8.500628269033635e-07, + "loss": 0.3825, + "step": 10890 + }, + { + "epoch": 1.312462372065021, + "grad_norm": 4.9276580810546875, + "learning_rate": 8.4968749843961e-07, + "loss": 0.37, + "step": 10900 + }, + { + "epoch": 1.3136664659843467, + "grad_norm": 4.929906845092773, + "learning_rate": 8.493117838796482e-07, + "loss": 0.3751, + "step": 10910 + }, + { + "epoch": 1.3148705599036725, + "grad_norm": 4.179794788360596, + "learning_rate": 8.489356836383112e-07, + "loss": 0.3714, + "step": 10920 + }, + { + "epoch": 1.3160746538229982, + "grad_norm": 4.671365261077881, + "learning_rate": 8.485591981308583e-07, + "loss": 0.3665, + "step": 10930 + }, + { + "epoch": 
1.317278747742324, + "grad_norm": 4.073710918426514, + "learning_rate": 8.481823277729734e-07, + "loss": 0.3602, + "step": 10940 + }, + { + "epoch": 1.3184828416616496, + "grad_norm": 4.633068084716797, + "learning_rate": 8.478050729807663e-07, + "loss": 0.3682, + "step": 10950 + }, + { + "epoch": 1.3196869355809753, + "grad_norm": 5.233600616455078, + "learning_rate": 8.474274341707701e-07, + "loss": 0.3781, + "step": 10960 + }, + { + "epoch": 1.320891029500301, + "grad_norm": 4.329504013061523, + "learning_rate": 8.470494117599431e-07, + "loss": 0.3763, + "step": 10970 + }, + { + "epoch": 1.3220951234196268, + "grad_norm": 4.211668968200684, + "learning_rate": 8.466710061656664e-07, + "loss": 0.3325, + "step": 10980 + }, + { + "epoch": 1.3232992173389524, + "grad_norm": 4.388267993927002, + "learning_rate": 8.462922178057443e-07, + "loss": 0.3709, + "step": 10990 + }, + { + "epoch": 1.3245033112582782, + "grad_norm": 5.167718887329102, + "learning_rate": 8.45913047098404e-07, + "loss": 0.362, + "step": 11000 + }, + { + "epoch": 1.3257074051776039, + "grad_norm": 4.614595890045166, + "learning_rate": 8.455334944622945e-07, + "loss": 0.3549, + "step": 11010 + }, + { + "epoch": 1.3269114990969295, + "grad_norm": 4.618056774139404, + "learning_rate": 8.451535603164864e-07, + "loss": 0.3773, + "step": 11020 + }, + { + "epoch": 1.3281155930162551, + "grad_norm": 4.563729763031006, + "learning_rate": 8.447732450804723e-07, + "loss": 0.3688, + "step": 11030 + }, + { + "epoch": 1.329319686935581, + "grad_norm": 4.429327011108398, + "learning_rate": 8.443925491741646e-07, + "loss": 0.3429, + "step": 11040 + }, + { + "epoch": 1.3305237808549066, + "grad_norm": 4.474249362945557, + "learning_rate": 8.440114730178966e-07, + "loss": 0.3879, + "step": 11050 + }, + { + "epoch": 1.3317278747742325, + "grad_norm": 4.212963581085205, + "learning_rate": 8.436300170324215e-07, + "loss": 0.349, + "step": 11060 + }, + { + "epoch": 1.3329319686935581, + "grad_norm": 4.393470287322998, + "learning_rate": 8.432481816389112e-07, + "loss": 0.3609, + "step": 11070 + }, + { + "epoch": 1.3341360626128838, + "grad_norm": 4.512639045715332, + "learning_rate": 8.428659672589574e-07, + "loss": 0.3446, + "step": 11080 + }, + { + "epoch": 1.3353401565322094, + "grad_norm": 5.399291515350342, + "learning_rate": 8.424833743145696e-07, + "loss": 0.3643, + "step": 11090 + }, + { + "epoch": 1.3365442504515352, + "grad_norm": 4.692162990570068, + "learning_rate": 8.421004032281756e-07, + "loss": 0.3782, + "step": 11100 + }, + { + "epoch": 1.3377483443708609, + "grad_norm": 4.4849677085876465, + "learning_rate": 8.417170544226203e-07, + "loss": 0.36, + "step": 11110 + }, + { + "epoch": 1.3389524382901867, + "grad_norm": 4.692328453063965, + "learning_rate": 8.413333283211664e-07, + "loss": 0.3626, + "step": 11120 + }, + { + "epoch": 1.3401565322095124, + "grad_norm": 4.903812408447266, + "learning_rate": 8.409492253474925e-07, + "loss": 0.3576, + "step": 11130 + }, + { + "epoch": 1.341360626128838, + "grad_norm": 4.484142780303955, + "learning_rate": 8.405647459256937e-07, + "loss": 0.3611, + "step": 11140 + }, + { + "epoch": 1.3425647200481636, + "grad_norm": 4.777652263641357, + "learning_rate": 8.401798904802804e-07, + "loss": 0.3654, + "step": 11150 + }, + { + "epoch": 1.3437688139674895, + "grad_norm": 4.49363374710083, + "learning_rate": 8.397946594361785e-07, + "loss": 0.3684, + "step": 11160 + }, + { + "epoch": 1.3449729078868151, + "grad_norm": 5.207254886627197, + "learning_rate": 8.394090532187284e-07, + "loss": 0.3706, 
+ "step": 11170 + }, + { + "epoch": 1.346177001806141, + "grad_norm": 5.246047496795654, + "learning_rate": 8.390230722536849e-07, + "loss": 0.365, + "step": 11180 + }, + { + "epoch": 1.3473810957254666, + "grad_norm": 4.5202317237854, + "learning_rate": 8.386367169672164e-07, + "loss": 0.3549, + "step": 11190 + }, + { + "epoch": 1.3485851896447922, + "grad_norm": 5.0257368087768555, + "learning_rate": 8.382499877859046e-07, + "loss": 0.3765, + "step": 11200 + }, + { + "epoch": 1.3497892835641179, + "grad_norm": 3.513502597808838, + "learning_rate": 8.378628851367441e-07, + "loss": 0.3435, + "step": 11210 + }, + { + "epoch": 1.3509933774834437, + "grad_norm": 4.943020820617676, + "learning_rate": 8.374754094471421e-07, + "loss": 0.3754, + "step": 11220 + }, + { + "epoch": 1.3521974714027694, + "grad_norm": 4.6621012687683105, + "learning_rate": 8.37087561144917e-07, + "loss": 0.3823, + "step": 11230 + }, + { + "epoch": 1.3534015653220952, + "grad_norm": 3.8831217288970947, + "learning_rate": 8.366993406582996e-07, + "loss": 0.3606, + "step": 11240 + }, + { + "epoch": 1.3546056592414208, + "grad_norm": 4.315981388092041, + "learning_rate": 8.363107484159305e-07, + "loss": 0.3647, + "step": 11250 + }, + { + "epoch": 1.3558097531607465, + "grad_norm": 4.6641011238098145, + "learning_rate": 8.359217848468616e-07, + "loss": 0.377, + "step": 11260 + }, + { + "epoch": 1.357013847080072, + "grad_norm": 4.609387397766113, + "learning_rate": 8.355324503805545e-07, + "loss": 0.369, + "step": 11270 + }, + { + "epoch": 1.358217940999398, + "grad_norm": 4.37289571762085, + "learning_rate": 8.351427454468805e-07, + "loss": 0.3594, + "step": 11280 + }, + { + "epoch": 1.3594220349187236, + "grad_norm": 5.407008171081543, + "learning_rate": 8.347526704761192e-07, + "loss": 0.3732, + "step": 11290 + }, + { + "epoch": 1.3606261288380495, + "grad_norm": 4.5802083015441895, + "learning_rate": 8.3436222589896e-07, + "loss": 0.3506, + "step": 11300 + }, + { + "epoch": 1.361830222757375, + "grad_norm": 4.10429048538208, + "learning_rate": 8.339714121464994e-07, + "loss": 0.3917, + "step": 11310 + }, + { + "epoch": 1.3630343166767007, + "grad_norm": 4.250566005706787, + "learning_rate": 8.335802296502419e-07, + "loss": 0.3515, + "step": 11320 + }, + { + "epoch": 1.3642384105960264, + "grad_norm": 5.012816429138184, + "learning_rate": 8.33188678842099e-07, + "loss": 0.354, + "step": 11330 + }, + { + "epoch": 1.3654425045153522, + "grad_norm": 4.53849983215332, + "learning_rate": 8.327967601543891e-07, + "loss": 0.3612, + "step": 11340 + }, + { + "epoch": 1.3666465984346778, + "grad_norm": 4.784470081329346, + "learning_rate": 8.324044740198364e-07, + "loss": 0.356, + "step": 11350 + }, + { + "epoch": 1.3678506923540037, + "grad_norm": 4.100750923156738, + "learning_rate": 8.320118208715714e-07, + "loss": 0.3769, + "step": 11360 + }, + { + "epoch": 1.3690547862733293, + "grad_norm": 5.738262176513672, + "learning_rate": 8.316188011431291e-07, + "loss": 0.3797, + "step": 11370 + }, + { + "epoch": 1.370258880192655, + "grad_norm": 4.102308750152588, + "learning_rate": 8.312254152684495e-07, + "loss": 0.3723, + "step": 11380 + }, + { + "epoch": 1.3714629741119808, + "grad_norm": 3.786195993423462, + "learning_rate": 8.308316636818773e-07, + "loss": 0.3638, + "step": 11390 + }, + { + "epoch": 1.3726670680313064, + "grad_norm": 4.1659159660339355, + "learning_rate": 8.304375468181606e-07, + "loss": 0.3487, + "step": 11400 + }, + { + "epoch": 1.373871161950632, + "grad_norm": 4.081630229949951, + "learning_rate": 
8.300430651124505e-07, + "loss": 0.3602, + "step": 11410 + }, + { + "epoch": 1.375075255869958, + "grad_norm": 4.725644111633301, + "learning_rate": 8.296482190003019e-07, + "loss": 0.3746, + "step": 11420 + }, + { + "epoch": 1.3762793497892836, + "grad_norm": 4.421098709106445, + "learning_rate": 8.292530089176709e-07, + "loss": 0.3632, + "step": 11430 + }, + { + "epoch": 1.3774834437086092, + "grad_norm": 4.213558197021484, + "learning_rate": 8.288574353009164e-07, + "loss": 0.3748, + "step": 11440 + }, + { + "epoch": 1.378687537627935, + "grad_norm": 5.2602458000183105, + "learning_rate": 8.284614985867979e-07, + "loss": 0.355, + "step": 11450 + }, + { + "epoch": 1.3798916315472607, + "grad_norm": 4.735654354095459, + "learning_rate": 8.280651992124766e-07, + "loss": 0.3619, + "step": 11460 + }, + { + "epoch": 1.3810957254665863, + "grad_norm": 5.071203708648682, + "learning_rate": 8.276685376155133e-07, + "loss": 0.3693, + "step": 11470 + }, + { + "epoch": 1.3822998193859122, + "grad_norm": 4.431037902832031, + "learning_rate": 8.272715142338694e-07, + "loss": 0.3652, + "step": 11480 + }, + { + "epoch": 1.3835039133052378, + "grad_norm": 4.460841178894043, + "learning_rate": 8.268741295059056e-07, + "loss": 0.3732, + "step": 11490 + }, + { + "epoch": 1.3847080072245634, + "grad_norm": 5.048714637756348, + "learning_rate": 8.264763838703812e-07, + "loss": 0.364, + "step": 11500 + }, + { + "epoch": 1.3859121011438893, + "grad_norm": 4.322780132293701, + "learning_rate": 8.260782777664544e-07, + "loss": 0.3606, + "step": 11510 + }, + { + "epoch": 1.387116195063215, + "grad_norm": 4.763073921203613, + "learning_rate": 8.256798116336813e-07, + "loss": 0.3885, + "step": 11520 + }, + { + "epoch": 1.3883202889825406, + "grad_norm": 4.54296350479126, + "learning_rate": 8.252809859120153e-07, + "loss": 0.3629, + "step": 11530 + }, + { + "epoch": 1.3895243829018664, + "grad_norm": 4.481988430023193, + "learning_rate": 8.248818010418073e-07, + "loss": 0.3641, + "step": 11540 + }, + { + "epoch": 1.390728476821192, + "grad_norm": 4.431914806365967, + "learning_rate": 8.244822574638041e-07, + "loss": 0.3591, + "step": 11550 + }, + { + "epoch": 1.3919325707405177, + "grad_norm": 4.374257564544678, + "learning_rate": 8.240823556191489e-07, + "loss": 0.3634, + "step": 11560 + }, + { + "epoch": 1.3931366646598435, + "grad_norm": 3.9488606452941895, + "learning_rate": 8.23682095949381e-07, + "loss": 0.3466, + "step": 11570 + }, + { + "epoch": 1.3943407585791692, + "grad_norm": 4.069718837738037, + "learning_rate": 8.232814788964336e-07, + "loss": 0.3286, + "step": 11580 + }, + { + "epoch": 1.3955448524984948, + "grad_norm": 4.749855995178223, + "learning_rate": 8.228805049026355e-07, + "loss": 0.3546, + "step": 11590 + }, + { + "epoch": 1.3967489464178207, + "grad_norm": 3.9409117698669434, + "learning_rate": 8.224791744107089e-07, + "loss": 0.3663, + "step": 11600 + }, + { + "epoch": 1.3979530403371463, + "grad_norm": 4.028295993804932, + "learning_rate": 8.220774878637704e-07, + "loss": 0.3705, + "step": 11610 + }, + { + "epoch": 1.399157134256472, + "grad_norm": 4.911005973815918, + "learning_rate": 8.21675445705329e-07, + "loss": 0.3691, + "step": 11620 + }, + { + "epoch": 1.4003612281757978, + "grad_norm": 4.403053283691406, + "learning_rate": 8.212730483792868e-07, + "loss": 0.3736, + "step": 11630 + }, + { + "epoch": 1.4015653220951234, + "grad_norm": 4.316033840179443, + "learning_rate": 8.208702963299376e-07, + "loss": 0.373, + "step": 11640 + }, + { + "epoch": 1.402769416014449, + "grad_norm": 
5.129039764404297, + "learning_rate": 8.204671900019676e-07, + "loss": 0.37, + "step": 11650 + }, + { + "epoch": 1.403973509933775, + "grad_norm": 4.374388694763184, + "learning_rate": 8.200637298404531e-07, + "loss": 0.3621, + "step": 11660 + }, + { + "epoch": 1.4051776038531005, + "grad_norm": 4.385969161987305, + "learning_rate": 8.19659916290862e-07, + "loss": 0.3744, + "step": 11670 + }, + { + "epoch": 1.4063816977724262, + "grad_norm": 4.8797149658203125, + "learning_rate": 8.192557497990521e-07, + "loss": 0.3554, + "step": 11680 + }, + { + "epoch": 1.407585791691752, + "grad_norm": 3.9471144676208496, + "learning_rate": 8.188512308112707e-07, + "loss": 0.3702, + "step": 11690 + }, + { + "epoch": 1.4087898856110777, + "grad_norm": 4.70519495010376, + "learning_rate": 8.184463597741544e-07, + "loss": 0.3422, + "step": 11700 + }, + { + "epoch": 1.4099939795304035, + "grad_norm": 5.044809818267822, + "learning_rate": 8.180411371347287e-07, + "loss": 0.3702, + "step": 11710 + }, + { + "epoch": 1.4111980734497291, + "grad_norm": 4.174710273742676, + "learning_rate": 8.17635563340407e-07, + "loss": 0.3513, + "step": 11720 + }, + { + "epoch": 1.4124021673690548, + "grad_norm": 4.635099411010742, + "learning_rate": 8.172296388389907e-07, + "loss": 0.3779, + "step": 11730 + }, + { + "epoch": 1.4136062612883804, + "grad_norm": 5.230491638183594, + "learning_rate": 8.168233640786682e-07, + "loss": 0.3601, + "step": 11740 + }, + { + "epoch": 1.4148103552077063, + "grad_norm": 4.704545497894287, + "learning_rate": 8.164167395080149e-07, + "loss": 0.3569, + "step": 11750 + }, + { + "epoch": 1.416014449127032, + "grad_norm": 4.232817649841309, + "learning_rate": 8.160097655759917e-07, + "loss": 0.374, + "step": 11760 + }, + { + "epoch": 1.4172185430463577, + "grad_norm": 5.304251670837402, + "learning_rate": 8.156024427319463e-07, + "loss": 0.3668, + "step": 11770 + }, + { + "epoch": 1.4184226369656834, + "grad_norm": 4.5971245765686035, + "learning_rate": 8.151947714256111e-07, + "loss": 0.3778, + "step": 11780 + }, + { + "epoch": 1.419626730885009, + "grad_norm": 4.492901802062988, + "learning_rate": 8.14786752107103e-07, + "loss": 0.3418, + "step": 11790 + }, + { + "epoch": 1.4208308248043346, + "grad_norm": 4.80876350402832, + "learning_rate": 8.143783852269237e-07, + "loss": 0.3633, + "step": 11800 + }, + { + "epoch": 1.4220349187236605, + "grad_norm": 3.9827497005462646, + "learning_rate": 8.13969671235958e-07, + "loss": 0.3649, + "step": 11810 + }, + { + "epoch": 1.4232390126429861, + "grad_norm": 4.20520544052124, + "learning_rate": 8.135606105854747e-07, + "loss": 0.3495, + "step": 11820 + }, + { + "epoch": 1.424443106562312, + "grad_norm": 4.29602575302124, + "learning_rate": 8.131512037271247e-07, + "loss": 0.3678, + "step": 11830 + }, + { + "epoch": 1.4256472004816376, + "grad_norm": 4.648280143737793, + "learning_rate": 8.127414511129416e-07, + "loss": 0.3789, + "step": 11840 + }, + { + "epoch": 1.4268512944009633, + "grad_norm": 4.162654399871826, + "learning_rate": 8.123313531953404e-07, + "loss": 0.372, + "step": 11850 + }, + { + "epoch": 1.4280553883202889, + "grad_norm": 4.688777446746826, + "learning_rate": 8.119209104271176e-07, + "loss": 0.3576, + "step": 11860 + }, + { + "epoch": 1.4292594822396147, + "grad_norm": 4.464323997497559, + "learning_rate": 8.115101232614506e-07, + "loss": 0.3817, + "step": 11870 + }, + { + "epoch": 1.4304635761589404, + "grad_norm": 4.280879974365234, + "learning_rate": 8.110989921518965e-07, + "loss": 0.3604, + "step": 11880 + }, + { + "epoch": 
1.4316676700782662, + "grad_norm": 3.778425693511963, + "learning_rate": 8.106875175523926e-07, + "loss": 0.3553, + "step": 11890 + }, + { + "epoch": 1.4328717639975919, + "grad_norm": 4.960265159606934, + "learning_rate": 8.102756999172554e-07, + "loss": 0.3723, + "step": 11900 + }, + { + "epoch": 1.4340758579169175, + "grad_norm": 4.935343265533447, + "learning_rate": 8.098635397011802e-07, + "loss": 0.3714, + "step": 11910 + }, + { + "epoch": 1.4352799518362431, + "grad_norm": 4.417319297790527, + "learning_rate": 8.094510373592402e-07, + "loss": 0.3612, + "step": 11920 + }, + { + "epoch": 1.436484045755569, + "grad_norm": 4.819094181060791, + "learning_rate": 8.090381933468868e-07, + "loss": 0.3602, + "step": 11930 + }, + { + "epoch": 1.4376881396748946, + "grad_norm": 4.769229888916016, + "learning_rate": 8.086250081199484e-07, + "loss": 0.3597, + "step": 11940 + }, + { + "epoch": 1.4388922335942205, + "grad_norm": 4.872611999511719, + "learning_rate": 8.082114821346302e-07, + "loss": 0.3698, + "step": 11950 + }, + { + "epoch": 1.440096327513546, + "grad_norm": 4.3483686447143555, + "learning_rate": 8.077976158475135e-07, + "loss": 0.366, + "step": 11960 + }, + { + "epoch": 1.4413004214328717, + "grad_norm": 4.28345251083374, + "learning_rate": 8.073834097155555e-07, + "loss": 0.3564, + "step": 11970 + }, + { + "epoch": 1.4425045153521974, + "grad_norm": 4.1988606452941895, + "learning_rate": 8.069688641960888e-07, + "loss": 0.3557, + "step": 11980 + }, + { + "epoch": 1.4437086092715232, + "grad_norm": 4.156854152679443, + "learning_rate": 8.065539797468201e-07, + "loss": 0.3631, + "step": 11990 + }, + { + "epoch": 1.4449127031908489, + "grad_norm": 5.002780914306641, + "learning_rate": 8.061387568258312e-07, + "loss": 0.362, + "step": 12000 + }, + { + "epoch": 1.4461167971101747, + "grad_norm": 4.551509380340576, + "learning_rate": 8.057231958915767e-07, + "loss": 0.3545, + "step": 12010 + }, + { + "epoch": 1.4473208910295003, + "grad_norm": 3.529510498046875, + "learning_rate": 8.053072974028851e-07, + "loss": 0.3698, + "step": 12020 + }, + { + "epoch": 1.448524984948826, + "grad_norm": 5.073483467102051, + "learning_rate": 8.048910618189573e-07, + "loss": 0.3762, + "step": 12030 + }, + { + "epoch": 1.4497290788681516, + "grad_norm": 4.148519992828369, + "learning_rate": 8.044744895993665e-07, + "loss": 0.3714, + "step": 12040 + }, + { + "epoch": 1.4509331727874775, + "grad_norm": 5.03234338760376, + "learning_rate": 8.040575812040574e-07, + "loss": 0.3651, + "step": 12050 + }, + { + "epoch": 1.452137266706803, + "grad_norm": 4.286599159240723, + "learning_rate": 8.03640337093346e-07, + "loss": 0.3646, + "step": 12060 + }, + { + "epoch": 1.453341360626129, + "grad_norm": 5.805792808532715, + "learning_rate": 8.03222757727919e-07, + "loss": 0.3662, + "step": 12070 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 5.614697456359863, + "learning_rate": 8.028048435688333e-07, + "loss": 0.3661, + "step": 12080 + }, + { + "epoch": 1.4557495484647802, + "grad_norm": 4.117318630218506, + "learning_rate": 8.023865950775153e-07, + "loss": 0.3611, + "step": 12090 + }, + { + "epoch": 1.4569536423841059, + "grad_norm": 4.437227249145508, + "learning_rate": 8.019680127157606e-07, + "loss": 0.3551, + "step": 12100 + }, + { + "epoch": 1.4581577363034317, + "grad_norm": 4.852316856384277, + "learning_rate": 8.015490969457337e-07, + "loss": 0.3738, + "step": 12110 + }, + { + "epoch": 1.4593618302227573, + "grad_norm": 4.06812047958374, + "learning_rate": 8.011298482299666e-07, + "loss": 0.3535, 
+ "step": 12120 + }, + { + "epoch": 1.4605659241420832, + "grad_norm": 4.921239376068115, + "learning_rate": 8.007102670313595e-07, + "loss": 0.3586, + "step": 12130 + }, + { + "epoch": 1.4617700180614088, + "grad_norm": 3.9317848682403564, + "learning_rate": 8.002903538131794e-07, + "loss": 0.3527, + "step": 12140 + }, + { + "epoch": 1.4629741119807345, + "grad_norm": 5.692650318145752, + "learning_rate": 7.998701090390601e-07, + "loss": 0.364, + "step": 12150 + }, + { + "epoch": 1.46417820590006, + "grad_norm": 4.238543510437012, + "learning_rate": 7.994495331730013e-07, + "loss": 0.3516, + "step": 12160 + }, + { + "epoch": 1.465382299819386, + "grad_norm": 4.356393814086914, + "learning_rate": 7.990286266793685e-07, + "loss": 0.3464, + "step": 12170 + }, + { + "epoch": 1.4665863937387116, + "grad_norm": 4.616797924041748, + "learning_rate": 7.986073900228916e-07, + "loss": 0.3465, + "step": 12180 + }, + { + "epoch": 1.4677904876580374, + "grad_norm": 3.8541862964630127, + "learning_rate": 7.981858236686661e-07, + "loss": 0.3546, + "step": 12190 + }, + { + "epoch": 1.468994581577363, + "grad_norm": 5.685515880584717, + "learning_rate": 7.977639280821505e-07, + "loss": 0.3563, + "step": 12200 + }, + { + "epoch": 1.4701986754966887, + "grad_norm": 4.1002702713012695, + "learning_rate": 7.973417037291672e-07, + "loss": 0.3771, + "step": 12210 + }, + { + "epoch": 1.4714027694160143, + "grad_norm": 4.752336025238037, + "learning_rate": 7.969191510759019e-07, + "loss": 0.366, + "step": 12220 + }, + { + "epoch": 1.4726068633353402, + "grad_norm": 4.7561774253845215, + "learning_rate": 7.964962705889027e-07, + "loss": 0.3621, + "step": 12230 + }, + { + "epoch": 1.4738109572546658, + "grad_norm": 4.569270133972168, + "learning_rate": 7.96073062735079e-07, + "loss": 0.3662, + "step": 12240 + }, + { + "epoch": 1.4750150511739917, + "grad_norm": 3.9785332679748535, + "learning_rate": 7.956495279817025e-07, + "loss": 0.3711, + "step": 12250 + }, + { + "epoch": 1.4762191450933173, + "grad_norm": 4.953578948974609, + "learning_rate": 7.952256667964053e-07, + "loss": 0.3671, + "step": 12260 + }, + { + "epoch": 1.477423239012643, + "grad_norm": 4.805257320404053, + "learning_rate": 7.948014796471802e-07, + "loss": 0.3707, + "step": 12270 + }, + { + "epoch": 1.4786273329319686, + "grad_norm": 4.094834804534912, + "learning_rate": 7.943769670023799e-07, + "loss": 0.3699, + "step": 12280 + }, + { + "epoch": 1.4798314268512944, + "grad_norm": 5.696323394775391, + "learning_rate": 7.939521293307161e-07, + "loss": 0.3753, + "step": 12290 + }, + { + "epoch": 1.48103552077062, + "grad_norm": 4.848500728607178, + "learning_rate": 7.935269671012599e-07, + "loss": 0.3643, + "step": 12300 + }, + { + "epoch": 1.482239614689946, + "grad_norm": 4.916533946990967, + "learning_rate": 7.931014807834404e-07, + "loss": 0.3621, + "step": 12310 + }, + { + "epoch": 1.4834437086092715, + "grad_norm": 4.234400272369385, + "learning_rate": 7.926756708470447e-07, + "loss": 0.3464, + "step": 12320 + }, + { + "epoch": 1.4846478025285972, + "grad_norm": 4.844507217407227, + "learning_rate": 7.922495377622171e-07, + "loss": 0.3535, + "step": 12330 + }, + { + "epoch": 1.4858518964479228, + "grad_norm": 5.471369743347168, + "learning_rate": 7.918230819994588e-07, + "loss": 0.3592, + "step": 12340 + }, + { + "epoch": 1.4870559903672487, + "grad_norm": 5.131628036499023, + "learning_rate": 7.913963040296272e-07, + "loss": 0.376, + "step": 12350 + }, + { + "epoch": 1.4882600842865743, + "grad_norm": 4.308112144470215, + "learning_rate": 
7.909692043239353e-07, + "loss": 0.3526, + "step": 12360 + }, + { + "epoch": 1.4894641782059002, + "grad_norm": 4.5161356925964355, + "learning_rate": 7.905417833539518e-07, + "loss": 0.3548, + "step": 12370 + }, + { + "epoch": 1.4906682721252258, + "grad_norm": 4.657468795776367, + "learning_rate": 7.901140415915995e-07, + "loss": 0.3727, + "step": 12380 + }, + { + "epoch": 1.4918723660445514, + "grad_norm": 4.615851879119873, + "learning_rate": 7.896859795091562e-07, + "loss": 0.3728, + "step": 12390 + }, + { + "epoch": 1.493076459963877, + "grad_norm": 3.6912169456481934, + "learning_rate": 7.892575975792523e-07, + "loss": 0.3646, + "step": 12400 + }, + { + "epoch": 1.494280553883203, + "grad_norm": 4.871870517730713, + "learning_rate": 7.888288962748723e-07, + "loss": 0.3416, + "step": 12410 + }, + { + "epoch": 1.4954846478025285, + "grad_norm": 4.7089385986328125, + "learning_rate": 7.883998760693529e-07, + "loss": 0.3883, + "step": 12420 + }, + { + "epoch": 1.4966887417218544, + "grad_norm": 4.376954078674316, + "learning_rate": 7.87970537436383e-07, + "loss": 0.3427, + "step": 12430 + }, + { + "epoch": 1.49789283564118, + "grad_norm": 4.280700206756592, + "learning_rate": 7.875408808500028e-07, + "loss": 0.3651, + "step": 12440 + }, + { + "epoch": 1.4990969295605057, + "grad_norm": 4.794469356536865, + "learning_rate": 7.871109067846041e-07, + "loss": 0.3731, + "step": 12450 + }, + { + "epoch": 1.5003010234798313, + "grad_norm": 4.945312023162842, + "learning_rate": 7.86680615714929e-07, + "loss": 0.3586, + "step": 12460 + }, + { + "epoch": 1.5015051173991572, + "grad_norm": 3.5225555896759033, + "learning_rate": 7.862500081160692e-07, + "loss": 0.3595, + "step": 12470 + }, + { + "epoch": 1.502709211318483, + "grad_norm": 4.152462005615234, + "learning_rate": 7.858190844634664e-07, + "loss": 0.3777, + "step": 12480 + }, + { + "epoch": 1.5039133052378086, + "grad_norm": 5.704073905944824, + "learning_rate": 7.853878452329113e-07, + "loss": 0.375, + "step": 12490 + }, + { + "epoch": 1.5051173991571343, + "grad_norm": 5.431835174560547, + "learning_rate": 7.849562909005425e-07, + "loss": 0.3596, + "step": 12500 + }, + { + "epoch": 1.50632149307646, + "grad_norm": 4.785493850708008, + "learning_rate": 7.845244219428469e-07, + "loss": 0.3888, + "step": 12510 + }, + { + "epoch": 1.5075255869957855, + "grad_norm": 4.297571182250977, + "learning_rate": 7.84092238836659e-07, + "loss": 0.3576, + "step": 12520 + }, + { + "epoch": 1.5087296809151114, + "grad_norm": 4.948373317718506, + "learning_rate": 7.836597420591595e-07, + "loss": 0.3766, + "step": 12530 + }, + { + "epoch": 1.5099337748344372, + "grad_norm": 4.502270698547363, + "learning_rate": 7.832269320878762e-07, + "loss": 0.3624, + "step": 12540 + }, + { + "epoch": 1.5111378687537629, + "grad_norm": 3.8894035816192627, + "learning_rate": 7.827938094006821e-07, + "loss": 0.3743, + "step": 12550 + }, + { + "epoch": 1.5123419626730885, + "grad_norm": 4.615447044372559, + "learning_rate": 7.823603744757956e-07, + "loss": 0.3586, + "step": 12560 + }, + { + "epoch": 1.5135460565924141, + "grad_norm": 4.9232401847839355, + "learning_rate": 7.8192662779178e-07, + "loss": 0.3488, + "step": 12570 + }, + { + "epoch": 1.5147501505117398, + "grad_norm": 4.241856575012207, + "learning_rate": 7.81492569827543e-07, + "loss": 0.355, + "step": 12580 + }, + { + "epoch": 1.5159542444310656, + "grad_norm": 5.041738986968994, + "learning_rate": 7.810582010623354e-07, + "loss": 0.3755, + "step": 12590 + }, + { + "epoch": 1.5171583383503915, + "grad_norm": 
4.944552421569824, + "learning_rate": 7.806235219757518e-07, + "loss": 0.3643, + "step": 12600 + }, + { + "epoch": 1.5183624322697171, + "grad_norm": 5.554732799530029, + "learning_rate": 7.801885330477289e-07, + "loss": 0.3687, + "step": 12610 + }, + { + "epoch": 1.5195665261890428, + "grad_norm": 6.034419059753418, + "learning_rate": 7.797532347585459e-07, + "loss": 0.3595, + "step": 12620 + }, + { + "epoch": 1.5207706201083684, + "grad_norm": 4.2550048828125, + "learning_rate": 7.793176275888231e-07, + "loss": 0.3727, + "step": 12630 + }, + { + "epoch": 1.521974714027694, + "grad_norm": 4.084836006164551, + "learning_rate": 7.788817120195226e-07, + "loss": 0.3646, + "step": 12640 + }, + { + "epoch": 1.5231788079470199, + "grad_norm": 4.183859825134277, + "learning_rate": 7.784454885319464e-07, + "loss": 0.3846, + "step": 12650 + }, + { + "epoch": 1.5243829018663457, + "grad_norm": 4.216065406799316, + "learning_rate": 7.780089576077364e-07, + "loss": 0.3794, + "step": 12660 + }, + { + "epoch": 1.5255869957856714, + "grad_norm": 4.975666522979736, + "learning_rate": 7.775721197288744e-07, + "loss": 0.3903, + "step": 12670 + }, + { + "epoch": 1.526791089704997, + "grad_norm": 4.360125541687012, + "learning_rate": 7.77134975377681e-07, + "loss": 0.3481, + "step": 12680 + }, + { + "epoch": 1.5279951836243226, + "grad_norm": 5.113675594329834, + "learning_rate": 7.766975250368149e-07, + "loss": 0.3624, + "step": 12690 + }, + { + "epoch": 1.5291992775436483, + "grad_norm": 4.466128349304199, + "learning_rate": 7.76259769189273e-07, + "loss": 0.3619, + "step": 12700 + }, + { + "epoch": 1.5304033714629741, + "grad_norm": 4.945206165313721, + "learning_rate": 7.758217083183891e-07, + "loss": 0.358, + "step": 12710 + }, + { + "epoch": 1.5316074653823, + "grad_norm": 4.3737287521362305, + "learning_rate": 7.753833429078342e-07, + "loss": 0.3566, + "step": 12720 + }, + { + "epoch": 1.5328115593016256, + "grad_norm": 4.813685894012451, + "learning_rate": 7.749446734416152e-07, + "loss": 0.344, + "step": 12730 + }, + { + "epoch": 1.5340156532209512, + "grad_norm": 3.858191728591919, + "learning_rate": 7.745057004040751e-07, + "loss": 0.3461, + "step": 12740 + }, + { + "epoch": 1.5352197471402769, + "grad_norm": 4.396629333496094, + "learning_rate": 7.740664242798919e-07, + "loss": 0.3496, + "step": 12750 + }, + { + "epoch": 1.5364238410596025, + "grad_norm": 4.17794132232666, + "learning_rate": 7.73626845554078e-07, + "loss": 0.3584, + "step": 12760 + }, + { + "epoch": 1.5376279349789284, + "grad_norm": 6.110503673553467, + "learning_rate": 7.731869647119801e-07, + "loss": 0.3741, + "step": 12770 + }, + { + "epoch": 1.5388320288982542, + "grad_norm": 4.858775615692139, + "learning_rate": 7.727467822392787e-07, + "loss": 0.3489, + "step": 12780 + }, + { + "epoch": 1.5400361228175798, + "grad_norm": 4.899129390716553, + "learning_rate": 7.723062986219871e-07, + "loss": 0.3574, + "step": 12790 + }, + { + "epoch": 1.5412402167369055, + "grad_norm": 4.589954853057861, + "learning_rate": 7.718655143464508e-07, + "loss": 0.3697, + "step": 12800 + }, + { + "epoch": 1.542444310656231, + "grad_norm": 4.615177154541016, + "learning_rate": 7.71424429899348e-07, + "loss": 0.3574, + "step": 12810 + }, + { + "epoch": 1.5436484045755567, + "grad_norm": 5.081363201141357, + "learning_rate": 7.709830457676876e-07, + "loss": 0.3793, + "step": 12820 + }, + { + "epoch": 1.5448524984948826, + "grad_norm": 5.210774898529053, + "learning_rate": 7.7054136243881e-07, + "loss": 0.3562, + "step": 12830 + }, + { + "epoch": 
1.5460565924142085, + "grad_norm": 4.458885192871094, + "learning_rate": 7.700993804003855e-07, + "loss": 0.3619, + "step": 12840 + }, + { + "epoch": 1.547260686333534, + "grad_norm": 4.320379734039307, + "learning_rate": 7.696571001404142e-07, + "loss": 0.3629, + "step": 12850 + }, + { + "epoch": 1.5484647802528597, + "grad_norm": 4.779387474060059, + "learning_rate": 7.692145221472258e-07, + "loss": 0.3633, + "step": 12860 + }, + { + "epoch": 1.5496688741721854, + "grad_norm": 4.924083709716797, + "learning_rate": 7.687716469094786e-07, + "loss": 0.3624, + "step": 12870 + }, + { + "epoch": 1.550872968091511, + "grad_norm": 5.194228649139404, + "learning_rate": 7.68328474916159e-07, + "loss": 0.3592, + "step": 12880 + }, + { + "epoch": 1.5520770620108368, + "grad_norm": 4.606070041656494, + "learning_rate": 7.67885006656581e-07, + "loss": 0.3686, + "step": 12890 + }, + { + "epoch": 1.5532811559301627, + "grad_norm": 4.206083297729492, + "learning_rate": 7.674412426203859e-07, + "loss": 0.3551, + "step": 12900 + }, + { + "epoch": 1.5544852498494883, + "grad_norm": 4.67086124420166, + "learning_rate": 7.669971832975416e-07, + "loss": 0.3569, + "step": 12910 + }, + { + "epoch": 1.555689343768814, + "grad_norm": 5.904470443725586, + "learning_rate": 7.665528291783417e-07, + "loss": 0.3407, + "step": 12920 + }, + { + "epoch": 1.5568934376881396, + "grad_norm": 4.242117404937744, + "learning_rate": 7.661081807534058e-07, + "loss": 0.3422, + "step": 12930 + }, + { + "epoch": 1.5580975316074652, + "grad_norm": 4.790373802185059, + "learning_rate": 7.656632385136778e-07, + "loss": 0.3573, + "step": 12940 + }, + { + "epoch": 1.559301625526791, + "grad_norm": 4.904318809509277, + "learning_rate": 7.652180029504268e-07, + "loss": 0.3606, + "step": 12950 + }, + { + "epoch": 1.560505719446117, + "grad_norm": 4.863579750061035, + "learning_rate": 7.64772474555245e-07, + "loss": 0.361, + "step": 12960 + }, + { + "epoch": 1.5617098133654426, + "grad_norm": 5.459078311920166, + "learning_rate": 7.643266538200483e-07, + "loss": 0.3577, + "step": 12970 + }, + { + "epoch": 1.5629139072847682, + "grad_norm": 5.426388740539551, + "learning_rate": 7.638805412370755e-07, + "loss": 0.3725, + "step": 12980 + }, + { + "epoch": 1.5641180012040938, + "grad_norm": 4.903288841247559, + "learning_rate": 7.634341372988872e-07, + "loss": 0.3562, + "step": 12990 + }, + { + "epoch": 1.5653220951234195, + "grad_norm": 4.128101825714111, + "learning_rate": 7.629874424983664e-07, + "loss": 0.3405, + "step": 13000 + }, + { + "epoch": 1.5665261890427453, + "grad_norm": 4.6488213539123535, + "learning_rate": 7.625404573287163e-07, + "loss": 0.3731, + "step": 13010 + }, + { + "epoch": 1.5677302829620712, + "grad_norm": 4.610156059265137, + "learning_rate": 7.620931822834614e-07, + "loss": 0.3575, + "step": 13020 + }, + { + "epoch": 1.5689343768813968, + "grad_norm": 5.422335147857666, + "learning_rate": 7.616456178564462e-07, + "loss": 0.3833, + "step": 13030 + }, + { + "epoch": 1.5701384708007224, + "grad_norm": 4.844593048095703, + "learning_rate": 7.611977645418343e-07, + "loss": 0.3647, + "step": 13040 + }, + { + "epoch": 1.571342564720048, + "grad_norm": 4.274131774902344, + "learning_rate": 7.607496228341088e-07, + "loss": 0.3542, + "step": 13050 + }, + { + "epoch": 1.5725466586393737, + "grad_norm": 4.641569137573242, + "learning_rate": 7.60301193228071e-07, + "loss": 0.3704, + "step": 13060 + }, + { + "epoch": 1.5737507525586996, + "grad_norm": 4.771531105041504, + "learning_rate": 7.598524762188395e-07, + "loss": 0.3529, + 
"step": 13070 + }, + { + "epoch": 1.5749548464780254, + "grad_norm": 5.63432502746582, + "learning_rate": 7.594034723018514e-07, + "loss": 0.3554, + "step": 13080 + }, + { + "epoch": 1.576158940397351, + "grad_norm": 3.5664002895355225, + "learning_rate": 7.589541819728596e-07, + "loss": 0.3617, + "step": 13090 + }, + { + "epoch": 1.5773630343166767, + "grad_norm": 4.43233060836792, + "learning_rate": 7.585046057279337e-07, + "loss": 0.3795, + "step": 13100 + }, + { + "epoch": 1.5785671282360023, + "grad_norm": 4.293588638305664, + "learning_rate": 7.580547440634587e-07, + "loss": 0.3361, + "step": 13110 + }, + { + "epoch": 1.5797712221553282, + "grad_norm": 4.606287479400635, + "learning_rate": 7.576045974761351e-07, + "loss": 0.3573, + "step": 13120 + }, + { + "epoch": 1.5809753160746538, + "grad_norm": 4.9702558517456055, + "learning_rate": 7.571541664629775e-07, + "loss": 0.3718, + "step": 13130 + }, + { + "epoch": 1.5821794099939797, + "grad_norm": 4.685069561004639, + "learning_rate": 7.567034515213151e-07, + "loss": 0.3704, + "step": 13140 + }, + { + "epoch": 1.5833835039133053, + "grad_norm": 4.804528713226318, + "learning_rate": 7.562524531487902e-07, + "loss": 0.3511, + "step": 13150 + }, + { + "epoch": 1.584587597832631, + "grad_norm": 5.332268714904785, + "learning_rate": 7.558011718433582e-07, + "loss": 0.3573, + "step": 13160 + }, + { + "epoch": 1.5857916917519566, + "grad_norm": 4.4862284660339355, + "learning_rate": 7.553496081032867e-07, + "loss": 0.3423, + "step": 13170 + }, + { + "epoch": 1.5869957856712824, + "grad_norm": 4.632198810577393, + "learning_rate": 7.548977624271555e-07, + "loss": 0.3719, + "step": 13180 + }, + { + "epoch": 1.588199879590608, + "grad_norm": 4.4371137619018555, + "learning_rate": 7.544456353138553e-07, + "loss": 0.3515, + "step": 13190 + }, + { + "epoch": 1.589403973509934, + "grad_norm": 4.162461757659912, + "learning_rate": 7.539932272625879e-07, + "loss": 0.363, + "step": 13200 + }, + { + "epoch": 1.5906080674292595, + "grad_norm": 4.980907917022705, + "learning_rate": 7.535405387728648e-07, + "loss": 0.362, + "step": 13210 + }, + { + "epoch": 1.5918121613485852, + "grad_norm": 4.321689128875732, + "learning_rate": 7.530875703445077e-07, + "loss": 0.3441, + "step": 13220 + }, + { + "epoch": 1.5930162552679108, + "grad_norm": 4.930966854095459, + "learning_rate": 7.526343224776471e-07, + "loss": 0.3505, + "step": 13230 + }, + { + "epoch": 1.5942203491872367, + "grad_norm": 4.267889499664307, + "learning_rate": 7.52180795672722e-07, + "loss": 0.3678, + "step": 13240 + }, + { + "epoch": 1.5954244431065623, + "grad_norm": 3.8834383487701416, + "learning_rate": 7.517269904304794e-07, + "loss": 0.3648, + "step": 13250 + }, + { + "epoch": 1.5966285370258881, + "grad_norm": 4.397730827331543, + "learning_rate": 7.512729072519739e-07, + "loss": 0.3601, + "step": 13260 + }, + { + "epoch": 1.5978326309452138, + "grad_norm": 4.559187889099121, + "learning_rate": 7.508185466385666e-07, + "loss": 0.3508, + "step": 13270 + }, + { + "epoch": 1.5990367248645394, + "grad_norm": 4.514613628387451, + "learning_rate": 7.503639090919255e-07, + "loss": 0.3578, + "step": 13280 + }, + { + "epoch": 1.600240818783865, + "grad_norm": 4.5233073234558105, + "learning_rate": 7.499089951140237e-07, + "loss": 0.3516, + "step": 13290 + }, + { + "epoch": 1.601444912703191, + "grad_norm": 4.616694450378418, + "learning_rate": 7.494538052071402e-07, + "loss": 0.3616, + "step": 13300 + }, + { + "epoch": 1.6026490066225165, + "grad_norm": 4.6488518714904785, + "learning_rate": 
7.489983398738579e-07, + "loss": 0.3582, + "step": 13310 + }, + { + "epoch": 1.6038531005418424, + "grad_norm": 4.645969390869141, + "learning_rate": 7.485425996170644e-07, + "loss": 0.3548, + "step": 13320 + }, + { + "epoch": 1.605057194461168, + "grad_norm": 5.864965438842773, + "learning_rate": 7.480865849399507e-07, + "loss": 0.3587, + "step": 13330 + }, + { + "epoch": 1.6062612883804936, + "grad_norm": 4.283803939819336, + "learning_rate": 7.476302963460108e-07, + "loss": 0.3626, + "step": 13340 + }, + { + "epoch": 1.6074653822998193, + "grad_norm": 4.545533657073975, + "learning_rate": 7.47173734339041e-07, + "loss": 0.3526, + "step": 13350 + }, + { + "epoch": 1.6086694762191451, + "grad_norm": 4.885293483734131, + "learning_rate": 7.467168994231393e-07, + "loss": 0.3685, + "step": 13360 + }, + { + "epoch": 1.6098735701384708, + "grad_norm": 4.112198829650879, + "learning_rate": 7.462597921027056e-07, + "loss": 0.3727, + "step": 13370 + }, + { + "epoch": 1.6110776640577966, + "grad_norm": 4.272058963775635, + "learning_rate": 7.458024128824403e-07, + "loss": 0.3567, + "step": 13380 + }, + { + "epoch": 1.6122817579771223, + "grad_norm": 4.891336441040039, + "learning_rate": 7.453447622673438e-07, + "loss": 0.3566, + "step": 13390 + }, + { + "epoch": 1.6134858518964479, + "grad_norm": 5.003636837005615, + "learning_rate": 7.448868407627163e-07, + "loss": 0.3717, + "step": 13400 + }, + { + "epoch": 1.6146899458157735, + "grad_norm": 3.9844002723693848, + "learning_rate": 7.444286488741571e-07, + "loss": 0.3537, + "step": 13410 + }, + { + "epoch": 1.6158940397350994, + "grad_norm": 4.326488018035889, + "learning_rate": 7.439701871075641e-07, + "loss": 0.3353, + "step": 13420 + }, + { + "epoch": 1.617098133654425, + "grad_norm": 4.168161392211914, + "learning_rate": 7.435114559691333e-07, + "loss": 0.3506, + "step": 13430 + }, + { + "epoch": 1.6183022275737509, + "grad_norm": 5.062152862548828, + "learning_rate": 7.430524559653575e-07, + "loss": 0.3536, + "step": 13440 + }, + { + "epoch": 1.6195063214930765, + "grad_norm": 5.29563570022583, + "learning_rate": 7.425931876030272e-07, + "loss": 0.359, + "step": 13450 + }, + { + "epoch": 1.6207104154124021, + "grad_norm": 4.655216693878174, + "learning_rate": 7.421336513892284e-07, + "loss": 0.3459, + "step": 13460 + }, + { + "epoch": 1.6219145093317278, + "grad_norm": 4.558264255523682, + "learning_rate": 7.416738478313438e-07, + "loss": 0.3603, + "step": 13470 + }, + { + "epoch": 1.6231186032510536, + "grad_norm": 4.36596155166626, + "learning_rate": 7.412137774370501e-07, + "loss": 0.3632, + "step": 13480 + }, + { + "epoch": 1.6243226971703792, + "grad_norm": 4.248297214508057, + "learning_rate": 7.407534407143198e-07, + "loss": 0.3575, + "step": 13490 + }, + { + "epoch": 1.625526791089705, + "grad_norm": 4.935293197631836, + "learning_rate": 7.402928381714184e-07, + "loss": 0.3583, + "step": 13500 + }, + { + "epoch": 1.6267308850090307, + "grad_norm": 4.29832649230957, + "learning_rate": 7.398319703169057e-07, + "loss": 0.3593, + "step": 13510 + }, + { + "epoch": 1.6279349789283564, + "grad_norm": 4.707507610321045, + "learning_rate": 7.39370837659634e-07, + "loss": 0.3486, + "step": 13520 + }, + { + "epoch": 1.629139072847682, + "grad_norm": 4.7867326736450195, + "learning_rate": 7.389094407087481e-07, + "loss": 0.3708, + "step": 13530 + }, + { + "epoch": 1.6303431667670079, + "grad_norm": 5.004173755645752, + "learning_rate": 7.384477799736847e-07, + "loss": 0.3693, + "step": 13540 + }, + { + "epoch": 1.6315472606863335, + "grad_norm": 
4.378966331481934, + "learning_rate": 7.379858559641716e-07, + "loss": 0.3792, + "step": 13550 + }, + { + "epoch": 1.6327513546056593, + "grad_norm": 4.35708475112915, + "learning_rate": 7.375236691902272e-07, + "loss": 0.357, + "step": 13560 + }, + { + "epoch": 1.633955448524985, + "grad_norm": 4.158879280090332, + "learning_rate": 7.370612201621606e-07, + "loss": 0.3705, + "step": 13570 + }, + { + "epoch": 1.6351595424443106, + "grad_norm": 4.620648384094238, + "learning_rate": 7.365985093905693e-07, + "loss": 0.3288, + "step": 13580 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 4.588129997253418, + "learning_rate": 7.361355373863413e-07, + "loss": 0.3545, + "step": 13590 + }, + { + "epoch": 1.637567730282962, + "grad_norm": 4.273639678955078, + "learning_rate": 7.356723046606517e-07, + "loss": 0.3597, + "step": 13600 + }, + { + "epoch": 1.6387718242022877, + "grad_norm": 4.793459415435791, + "learning_rate": 7.352088117249644e-07, + "loss": 0.3532, + "step": 13610 + }, + { + "epoch": 1.6399759181216136, + "grad_norm": 4.27385950088501, + "learning_rate": 7.347450590910299e-07, + "loss": 0.3787, + "step": 13620 + }, + { + "epoch": 1.6411800120409392, + "grad_norm": 4.229093551635742, + "learning_rate": 7.34281047270886e-07, + "loss": 0.3592, + "step": 13630 + }, + { + "epoch": 1.6423841059602649, + "grad_norm": 4.402678489685059, + "learning_rate": 7.338167767768564e-07, + "loss": 0.3612, + "step": 13640 + }, + { + "epoch": 1.6435881998795905, + "grad_norm": 4.09978723526001, + "learning_rate": 7.333522481215503e-07, + "loss": 0.3571, + "step": 13650 + }, + { + "epoch": 1.6447922937989163, + "grad_norm": 4.659477710723877, + "learning_rate": 7.32887461817862e-07, + "loss": 0.3725, + "step": 13660 + }, + { + "epoch": 1.645996387718242, + "grad_norm": 4.500072002410889, + "learning_rate": 7.324224183789707e-07, + "loss": 0.3458, + "step": 13670 + }, + { + "epoch": 1.6472004816375678, + "grad_norm": 5.1016526222229, + "learning_rate": 7.319571183183388e-07, + "loss": 0.3734, + "step": 13680 + }, + { + "epoch": 1.6484045755568935, + "grad_norm": 4.819193363189697, + "learning_rate": 7.314915621497129e-07, + "loss": 0.3601, + "step": 13690 + }, + { + "epoch": 1.649608669476219, + "grad_norm": 4.4075026512146, + "learning_rate": 7.310257503871214e-07, + "loss": 0.3556, + "step": 13700 + }, + { + "epoch": 1.6508127633955447, + "grad_norm": 4.471024036407471, + "learning_rate": 7.305596835448753e-07, + "loss": 0.3625, + "step": 13710 + }, + { + "epoch": 1.6520168573148706, + "grad_norm": 4.29016637802124, + "learning_rate": 7.300933621375676e-07, + "loss": 0.3619, + "step": 13720 + }, + { + "epoch": 1.6532209512341962, + "grad_norm": 4.514208793640137, + "learning_rate": 7.296267866800722e-07, + "loss": 0.3622, + "step": 13730 + }, + { + "epoch": 1.654425045153522, + "grad_norm": 4.275468826293945, + "learning_rate": 7.291599576875432e-07, + "loss": 0.3667, + "step": 13740 + }, + { + "epoch": 1.6556291390728477, + "grad_norm": 4.0805559158325195, + "learning_rate": 7.286928756754148e-07, + "loss": 0.371, + "step": 13750 + }, + { + "epoch": 1.6568332329921733, + "grad_norm": 4.84345006942749, + "learning_rate": 7.282255411594006e-07, + "loss": 0.3696, + "step": 13760 + }, + { + "epoch": 1.658037326911499, + "grad_norm": 4.703734874725342, + "learning_rate": 7.277579546554931e-07, + "loss": 0.3673, + "step": 13770 + }, + { + "epoch": 1.6592414208308248, + "grad_norm": 4.18894624710083, + "learning_rate": 7.272901166799627e-07, + "loss": 0.3365, + "step": 13780 + }, + { + "epoch": 
1.6604455147501507, + "grad_norm": 4.9901204109191895, + "learning_rate": 7.268220277493578e-07, + "loss": 0.3588, + "step": 13790 + }, + { + "epoch": 1.6616496086694763, + "grad_norm": 4.896132946014404, + "learning_rate": 7.263536883805039e-07, + "loss": 0.3659, + "step": 13800 + }, + { + "epoch": 1.662853702588802, + "grad_norm": 4.311833381652832, + "learning_rate": 7.258850990905025e-07, + "loss": 0.3707, + "step": 13810 + }, + { + "epoch": 1.6640577965081276, + "grad_norm": 4.157628059387207, + "learning_rate": 7.254162603967317e-07, + "loss": 0.3498, + "step": 13820 + }, + { + "epoch": 1.6652618904274532, + "grad_norm": 5.240469932556152, + "learning_rate": 7.249471728168443e-07, + "loss": 0.3559, + "step": 13830 + }, + { + "epoch": 1.666465984346779, + "grad_norm": 4.077708721160889, + "learning_rate": 7.244778368687687e-07, + "loss": 0.3745, + "step": 13840 + }, + { + "epoch": 1.667670078266105, + "grad_norm": 4.9550395011901855, + "learning_rate": 7.240082530707069e-07, + "loss": 0.3563, + "step": 13850 + }, + { + "epoch": 1.6688741721854305, + "grad_norm": 5.530270576477051, + "learning_rate": 7.235384219411348e-07, + "loss": 0.3764, + "step": 13860 + }, + { + "epoch": 1.6700782661047562, + "grad_norm": 4.50790548324585, + "learning_rate": 7.230683439988012e-07, + "loss": 0.3471, + "step": 13870 + }, + { + "epoch": 1.6712823600240818, + "grad_norm": 4.373943328857422, + "learning_rate": 7.225980197627277e-07, + "loss": 0.3601, + "step": 13880 + }, + { + "epoch": 1.6724864539434074, + "grad_norm": 3.9449055194854736, + "learning_rate": 7.221274497522076e-07, + "loss": 0.3533, + "step": 13890 + }, + { + "epoch": 1.6736905478627333, + "grad_norm": 4.625890254974365, + "learning_rate": 7.216566344868058e-07, + "loss": 0.3771, + "step": 13900 + }, + { + "epoch": 1.6748946417820592, + "grad_norm": 4.7843475341796875, + "learning_rate": 7.211855744863577e-07, + "loss": 0.3477, + "step": 13910 + }, + { + "epoch": 1.6760987357013848, + "grad_norm": 4.275618076324463, + "learning_rate": 7.207142702709688e-07, + "loss": 0.3452, + "step": 13920 + }, + { + "epoch": 1.6773028296207104, + "grad_norm": 5.26132869720459, + "learning_rate": 7.202427223610152e-07, + "loss": 0.3568, + "step": 13930 + }, + { + "epoch": 1.678506923540036, + "grad_norm": 4.528031826019287, + "learning_rate": 7.197709312771406e-07, + "loss": 0.347, + "step": 13940 + }, + { + "epoch": 1.6797110174593617, + "grad_norm": 4.68961763381958, + "learning_rate": 7.192988975402583e-07, + "loss": 0.3687, + "step": 13950 + }, + { + "epoch": 1.6809151113786875, + "grad_norm": 4.3820719718933105, + "learning_rate": 7.188266216715493e-07, + "loss": 0.3572, + "step": 13960 + }, + { + "epoch": 1.6821192052980134, + "grad_norm": 3.974177598953247, + "learning_rate": 7.183541041924616e-07, + "loss": 0.34, + "step": 13970 + }, + { + "epoch": 1.683323299217339, + "grad_norm": 4.8562331199646, + "learning_rate": 7.178813456247102e-07, + "loss": 0.3532, + "step": 13980 + }, + { + "epoch": 1.6845273931366647, + "grad_norm": 3.9439549446105957, + "learning_rate": 7.174083464902763e-07, + "loss": 0.3459, + "step": 13990 + }, + { + "epoch": 1.6857314870559903, + "grad_norm": 4.226308345794678, + "learning_rate": 7.16935107311407e-07, + "loss": 0.3352, + "step": 14000 + }, + { + "epoch": 1.686935580975316, + "grad_norm": 4.850135326385498, + "learning_rate": 7.164616286106135e-07, + "loss": 0.3661, + "step": 14010 + }, + { + "epoch": 1.6881396748946418, + "grad_norm": 4.845891952514648, + "learning_rate": 7.159879109106725e-07, + "loss": 0.3868, 
+ "step": 14020 + }, + { + "epoch": 1.6893437688139676, + "grad_norm": 5.063507556915283, + "learning_rate": 7.155139547346242e-07, + "loss": 0.3628, + "step": 14030 + }, + { + "epoch": 1.6905478627332933, + "grad_norm": 4.6817216873168945, + "learning_rate": 7.15039760605772e-07, + "loss": 0.3744, + "step": 14040 + }, + { + "epoch": 1.691751956652619, + "grad_norm": 4.315075874328613, + "learning_rate": 7.145653290476819e-07, + "loss": 0.3613, + "step": 14050 + }, + { + "epoch": 1.6929560505719445, + "grad_norm": 4.234760284423828, + "learning_rate": 7.140906605841825e-07, + "loss": 0.3733, + "step": 14060 + }, + { + "epoch": 1.6941601444912702, + "grad_norm": 5.843511581420898, + "learning_rate": 7.136157557393637e-07, + "loss": 0.3443, + "step": 14070 + }, + { + "epoch": 1.695364238410596, + "grad_norm": 4.704221248626709, + "learning_rate": 7.131406150375762e-07, + "loss": 0.3384, + "step": 14080 + }, + { + "epoch": 1.6965683323299219, + "grad_norm": 4.1078200340271, + "learning_rate": 7.126652390034316e-07, + "loss": 0.3554, + "step": 14090 + }, + { + "epoch": 1.6977724262492475, + "grad_norm": 4.6124773025512695, + "learning_rate": 7.12189628161801e-07, + "loss": 0.3323, + "step": 14100 + }, + { + "epoch": 1.6989765201685731, + "grad_norm": 3.9569902420043945, + "learning_rate": 7.117137830378146e-07, + "loss": 0.3581, + "step": 14110 + }, + { + "epoch": 1.7001806140878988, + "grad_norm": 4.327024459838867, + "learning_rate": 7.112377041568617e-07, + "loss": 0.3605, + "step": 14120 + }, + { + "epoch": 1.7013847080072244, + "grad_norm": 4.041974067687988, + "learning_rate": 7.107613920445895e-07, + "loss": 0.3514, + "step": 14130 + }, + { + "epoch": 1.7025888019265503, + "grad_norm": 4.295658588409424, + "learning_rate": 7.102848472269026e-07, + "loss": 0.3489, + "step": 14140 + }, + { + "epoch": 1.7037928958458761, + "grad_norm": 4.117722988128662, + "learning_rate": 7.098080702299628e-07, + "loss": 0.3382, + "step": 14150 + }, + { + "epoch": 1.7049969897652018, + "grad_norm": 5.249290943145752, + "learning_rate": 7.093310615801879e-07, + "loss": 0.3696, + "step": 14160 + }, + { + "epoch": 1.7062010836845274, + "grad_norm": 3.8647286891937256, + "learning_rate": 7.088538218042518e-07, + "loss": 0.3403, + "step": 14170 + }, + { + "epoch": 1.707405177603853, + "grad_norm": 4.454891204833984, + "learning_rate": 7.083763514290834e-07, + "loss": 0.3743, + "step": 14180 + }, + { + "epoch": 1.7086092715231787, + "grad_norm": 4.183931827545166, + "learning_rate": 7.078986509818662e-07, + "loss": 0.3493, + "step": 14190 + }, + { + "epoch": 1.7098133654425045, + "grad_norm": 3.9510889053344727, + "learning_rate": 7.074207209900379e-07, + "loss": 0.3469, + "step": 14200 + }, + { + "epoch": 1.7110174593618304, + "grad_norm": 4.839264869689941, + "learning_rate": 7.069425619812896e-07, + "loss": 0.3444, + "step": 14210 + }, + { + "epoch": 1.712221553281156, + "grad_norm": 4.237350940704346, + "learning_rate": 7.064641744835649e-07, + "loss": 0.3474, + "step": 14220 + }, + { + "epoch": 1.7134256472004816, + "grad_norm": 4.17114782333374, + "learning_rate": 7.059855590250603e-07, + "loss": 0.3465, + "step": 14230 + }, + { + "epoch": 1.7146297411198073, + "grad_norm": 4.114003658294678, + "learning_rate": 7.055067161342233e-07, + "loss": 0.3674, + "step": 14240 + }, + { + "epoch": 1.715833835039133, + "grad_norm": 4.886813640594482, + "learning_rate": 7.050276463397533e-07, + "loss": 0.3848, + "step": 14250 + }, + { + "epoch": 1.7170379289584587, + "grad_norm": 4.069955348968506, + "learning_rate": 
7.045483501705996e-07, + "loss": 0.3493, + "step": 14260 + }, + { + "epoch": 1.7182420228777846, + "grad_norm": 4.502857685089111, + "learning_rate": 7.040688281559617e-07, + "loss": 0.3548, + "step": 14270 + }, + { + "epoch": 1.7194461167971102, + "grad_norm": 4.283501148223877, + "learning_rate": 7.035890808252884e-07, + "loss": 0.3571, + "step": 14280 + }, + { + "epoch": 1.7206502107164359, + "grad_norm": 4.563022136688232, + "learning_rate": 7.031091087082772e-07, + "loss": 0.3485, + "step": 14290 + }, + { + "epoch": 1.7218543046357615, + "grad_norm": 4.165189266204834, + "learning_rate": 7.02628912334874e-07, + "loss": 0.3417, + "step": 14300 + }, + { + "epoch": 1.7230583985550871, + "grad_norm": 4.657063961029053, + "learning_rate": 7.021484922352721e-07, + "loss": 0.3611, + "step": 14310 + }, + { + "epoch": 1.724262492474413, + "grad_norm": 6.094346046447754, + "learning_rate": 7.016678489399121e-07, + "loss": 0.3371, + "step": 14320 + }, + { + "epoch": 1.7254665863937388, + "grad_norm": 4.576262474060059, + "learning_rate": 7.011869829794806e-07, + "loss": 0.3624, + "step": 14330 + }, + { + "epoch": 1.7266706803130645, + "grad_norm": 5.231967449188232, + "learning_rate": 7.007058948849105e-07, + "loss": 0.3745, + "step": 14340 + }, + { + "epoch": 1.72787477423239, + "grad_norm": 4.39863395690918, + "learning_rate": 7.002245851873794e-07, + "loss": 0.3545, + "step": 14350 + }, + { + "epoch": 1.7290788681517157, + "grad_norm": 4.428983211517334, + "learning_rate": 6.997430544183103e-07, + "loss": 0.3534, + "step": 14360 + }, + { + "epoch": 1.7302829620710414, + "grad_norm": 5.451033115386963, + "learning_rate": 6.992613031093698e-07, + "loss": 0.3584, + "step": 14370 + }, + { + "epoch": 1.7314870559903672, + "grad_norm": 4.715031147003174, + "learning_rate": 6.987793317924682e-07, + "loss": 0.3643, + "step": 14380 + }, + { + "epoch": 1.732691149909693, + "grad_norm": 4.199245452880859, + "learning_rate": 6.982971409997583e-07, + "loss": 0.3539, + "step": 14390 + }, + { + "epoch": 1.7338952438290187, + "grad_norm": 5.606119632720947, + "learning_rate": 6.97814731263636e-07, + "loss": 0.3613, + "step": 14400 + }, + { + "epoch": 1.7350993377483444, + "grad_norm": 5.036284923553467, + "learning_rate": 6.973321031167382e-07, + "loss": 0.3679, + "step": 14410 + }, + { + "epoch": 1.73630343166767, + "grad_norm": 4.951879978179932, + "learning_rate": 6.968492570919434e-07, + "loss": 0.3572, + "step": 14420 + }, + { + "epoch": 1.7375075255869958, + "grad_norm": 4.428969860076904, + "learning_rate": 6.963661937223703e-07, + "loss": 0.3538, + "step": 14430 + }, + { + "epoch": 1.7387116195063215, + "grad_norm": 3.7024569511413574, + "learning_rate": 6.958829135413782e-07, + "loss": 0.3644, + "step": 14440 + }, + { + "epoch": 1.7399157134256473, + "grad_norm": 4.4168381690979, + "learning_rate": 6.95399417082565e-07, + "loss": 0.3498, + "step": 14450 + }, + { + "epoch": 1.741119807344973, + "grad_norm": 4.818751335144043, + "learning_rate": 6.949157048797678e-07, + "loss": 0.3726, + "step": 14460 + }, + { + "epoch": 1.7423239012642986, + "grad_norm": 5.769382953643799, + "learning_rate": 6.944317774670622e-07, + "loss": 0.3517, + "step": 14470 + }, + { + "epoch": 1.7435279951836242, + "grad_norm": 4.914524078369141, + "learning_rate": 6.939476353787607e-07, + "loss": 0.349, + "step": 14480 + }, + { + "epoch": 1.74473208910295, + "grad_norm": 4.6800456047058105, + "learning_rate": 6.934632791494134e-07, + "loss": 0.3725, + "step": 14490 + }, + { + "epoch": 1.7459361830222757, + "grad_norm": 
4.627834796905518, + "learning_rate": 6.929787093138067e-07, + "loss": 0.359, + "step": 14500 + }, + { + "epoch": 1.7471402769416016, + "grad_norm": 5.098109245300293, + "learning_rate": 6.924939264069626e-07, + "loss": 0.3502, + "step": 14510 + }, + { + "epoch": 1.7483443708609272, + "grad_norm": 4.18192720413208, + "learning_rate": 6.920089309641388e-07, + "loss": 0.3448, + "step": 14520 + }, + { + "epoch": 1.7495484647802528, + "grad_norm": 4.4052815437316895, + "learning_rate": 6.915237235208274e-07, + "loss": 0.3459, + "step": 14530 + }, + { + "epoch": 1.7507525586995785, + "grad_norm": 5.557136058807373, + "learning_rate": 6.910383046127544e-07, + "loss": 0.355, + "step": 14540 + }, + { + "epoch": 1.7519566526189043, + "grad_norm": 5.7654128074646, + "learning_rate": 6.905526747758796e-07, + "loss": 0.3624, + "step": 14550 + }, + { + "epoch": 1.75316074653823, + "grad_norm": 5.040695667266846, + "learning_rate": 6.900668345463957e-07, + "loss": 0.3513, + "step": 14560 + }, + { + "epoch": 1.7543648404575558, + "grad_norm": 4.529175758361816, + "learning_rate": 6.895807844607274e-07, + "loss": 0.348, + "step": 14570 + }, + { + "epoch": 1.7555689343768814, + "grad_norm": 4.473850727081299, + "learning_rate": 6.890945250555312e-07, + "loss": 0.3708, + "step": 14580 + }, + { + "epoch": 1.756773028296207, + "grad_norm": 4.2242865562438965, + "learning_rate": 6.88608056867695e-07, + "loss": 0.3536, + "step": 14590 + }, + { + "epoch": 1.7579771222155327, + "grad_norm": 4.953219413757324, + "learning_rate": 6.881213804343369e-07, + "loss": 0.3564, + "step": 14600 + }, + { + "epoch": 1.7591812161348586, + "grad_norm": 4.626575469970703, + "learning_rate": 6.876344962928051e-07, + "loss": 0.3624, + "step": 14610 + }, + { + "epoch": 1.7603853100541842, + "grad_norm": 5.615645408630371, + "learning_rate": 6.87147404980677e-07, + "loss": 0.3711, + "step": 14620 + }, + { + "epoch": 1.76158940397351, + "grad_norm": 4.350038051605225, + "learning_rate": 6.866601070357587e-07, + "loss": 0.3517, + "step": 14630 + }, + { + "epoch": 1.7627934978928357, + "grad_norm": 4.5289387702941895, + "learning_rate": 6.861726029960849e-07, + "loss": 0.3602, + "step": 14640 + }, + { + "epoch": 1.7639975918121613, + "grad_norm": 5.127388954162598, + "learning_rate": 6.856848933999173e-07, + "loss": 0.345, + "step": 14650 + }, + { + "epoch": 1.765201685731487, + "grad_norm": 4.675601482391357, + "learning_rate": 6.851969787857447e-07, + "loss": 0.3484, + "step": 14660 + }, + { + "epoch": 1.7664057796508128, + "grad_norm": 3.9305527210235596, + "learning_rate": 6.847088596922824e-07, + "loss": 0.3478, + "step": 14670 + }, + { + "epoch": 1.7676098735701384, + "grad_norm": 4.547889709472656, + "learning_rate": 6.842205366584715e-07, + "loss": 0.3627, + "step": 14680 + }, + { + "epoch": 1.7688139674894643, + "grad_norm": 5.042651653289795, + "learning_rate": 6.837320102234781e-07, + "loss": 0.3595, + "step": 14690 + }, + { + "epoch": 1.77001806140879, + "grad_norm": 4.645577907562256, + "learning_rate": 6.832432809266928e-07, + "loss": 0.3417, + "step": 14700 + }, + { + "epoch": 1.7712221553281156, + "grad_norm": 5.52669095993042, + "learning_rate": 6.827543493077306e-07, + "loss": 0.352, + "step": 14710 + }, + { + "epoch": 1.7724262492474412, + "grad_norm": 4.48500394821167, + "learning_rate": 6.822652159064293e-07, + "loss": 0.3427, + "step": 14720 + }, + { + "epoch": 1.773630343166767, + "grad_norm": 4.676848411560059, + "learning_rate": 6.817758812628503e-07, + "loss": 0.3568, + "step": 14730 + }, + { + "epoch": 
1.7748344370860927, + "grad_norm": 4.112384796142578, + "learning_rate": 6.812863459172764e-07, + "loss": 0.3626, + "step": 14740 + }, + { + "epoch": 1.7760385310054185, + "grad_norm": 4.3355326652526855, + "learning_rate": 6.807966104102122e-07, + "loss": 0.3408, + "step": 14750 + }, + { + "epoch": 1.7772426249247442, + "grad_norm": 4.12075138092041, + "learning_rate": 6.803066752823837e-07, + "loss": 0.3516, + "step": 14760 + }, + { + "epoch": 1.7784467188440698, + "grad_norm": 4.14115571975708, + "learning_rate": 6.79816541074737e-07, + "loss": 0.3442, + "step": 14770 + }, + { + "epoch": 1.7796508127633954, + "grad_norm": 4.440965175628662, + "learning_rate": 6.793262083284377e-07, + "loss": 0.348, + "step": 14780 + }, + { + "epoch": 1.7808549066827213, + "grad_norm": 4.727054595947266, + "learning_rate": 6.788356775848712e-07, + "loss": 0.3545, + "step": 14790 + }, + { + "epoch": 1.782059000602047, + "grad_norm": 4.421995639801025, + "learning_rate": 6.783449493856411e-07, + "loss": 0.3584, + "step": 14800 + }, + { + "epoch": 1.7832630945213728, + "grad_norm": 4.619497776031494, + "learning_rate": 6.778540242725695e-07, + "loss": 0.3621, + "step": 14810 + }, + { + "epoch": 1.7844671884406984, + "grad_norm": 4.975179672241211, + "learning_rate": 6.773629027876952e-07, + "loss": 0.3433, + "step": 14820 + }, + { + "epoch": 1.785671282360024, + "grad_norm": 4.3249030113220215, + "learning_rate": 6.768715854732743e-07, + "loss": 0.362, + "step": 14830 + }, + { + "epoch": 1.7868753762793497, + "grad_norm": 4.467803001403809, + "learning_rate": 6.763800728717792e-07, + "loss": 0.3589, + "step": 14840 + }, + { + "epoch": 1.7880794701986755, + "grad_norm": 5.496029376983643, + "learning_rate": 6.758883655258976e-07, + "loss": 0.3395, + "step": 14850 + }, + { + "epoch": 1.7892835641180012, + "grad_norm": 4.524773120880127, + "learning_rate": 6.753964639785321e-07, + "loss": 0.3544, + "step": 14860 + }, + { + "epoch": 1.790487658037327, + "grad_norm": 4.625549793243408, + "learning_rate": 6.749043687728005e-07, + "loss": 0.3721, + "step": 14870 + }, + { + "epoch": 1.7916917519566526, + "grad_norm": 5.1430230140686035, + "learning_rate": 6.744120804520335e-07, + "loss": 0.3516, + "step": 14880 + }, + { + "epoch": 1.7928958458759783, + "grad_norm": 5.0784173011779785, + "learning_rate": 6.739195995597757e-07, + "loss": 0.3579, + "step": 14890 + }, + { + "epoch": 1.794099939795304, + "grad_norm": 4.529468536376953, + "learning_rate": 6.734269266397836e-07, + "loss": 0.3573, + "step": 14900 + }, + { + "epoch": 1.7953040337146298, + "grad_norm": 4.950248718261719, + "learning_rate": 6.729340622360267e-07, + "loss": 0.3615, + "step": 14910 + }, + { + "epoch": 1.7965081276339554, + "grad_norm": 3.968449831008911, + "learning_rate": 6.724410068926852e-07, + "loss": 0.3361, + "step": 14920 + }, + { + "epoch": 1.7977122215532813, + "grad_norm": 4.806743144989014, + "learning_rate": 6.7194776115415e-07, + "loss": 0.3497, + "step": 14930 + }, + { + "epoch": 1.7989163154726069, + "grad_norm": 4.263092517852783, + "learning_rate": 6.714543255650229e-07, + "loss": 0.3659, + "step": 14940 + }, + { + "epoch": 1.8001204093919325, + "grad_norm": 4.752941131591797, + "learning_rate": 6.709607006701148e-07, + "loss": 0.3363, + "step": 14950 + }, + { + "epoch": 1.8013245033112582, + "grad_norm": 5.102241516113281, + "learning_rate": 6.704668870144458e-07, + "loss": 0.3487, + "step": 14960 + }, + { + "epoch": 1.802528597230584, + "grad_norm": 3.8051202297210693, + "learning_rate": 6.699728851432442e-07, + "loss": 
0.3373, + "step": 14970 + }, + { + "epoch": 1.8037326911499096, + "grad_norm": 4.386908054351807, + "learning_rate": 6.694786956019467e-07, + "loss": 0.3646, + "step": 14980 + }, + { + "epoch": 1.8049367850692355, + "grad_norm": 4.566622257232666, + "learning_rate": 6.689843189361962e-07, + "loss": 0.3698, + "step": 14990 + }, + { + "epoch": 1.8061408789885611, + "grad_norm": 4.474935054779053, + "learning_rate": 6.684897556918434e-07, + "loss": 0.3567, + "step": 15000 + }, + { + "epoch": 1.8073449729078868, + "grad_norm": 4.712069034576416, + "learning_rate": 6.67995006414944e-07, + "loss": 0.3573, + "step": 15010 + }, + { + "epoch": 1.8085490668272124, + "grad_norm": 4.497696876525879, + "learning_rate": 6.675000716517595e-07, + "loss": 0.3373, + "step": 15020 + }, + { + "epoch": 1.8097531607465382, + "grad_norm": 4.327920436859131, + "learning_rate": 6.670049519487565e-07, + "loss": 0.3689, + "step": 15030 + }, + { + "epoch": 1.810957254665864, + "grad_norm": 6.609139919281006, + "learning_rate": 6.665096478526053e-07, + "loss": 0.3465, + "step": 15040 + }, + { + "epoch": 1.8121613485851897, + "grad_norm": 4.8396196365356445, + "learning_rate": 6.6601415991018e-07, + "loss": 0.3628, + "step": 15050 + }, + { + "epoch": 1.8133654425045154, + "grad_norm": 5.569112777709961, + "learning_rate": 6.655184886685577e-07, + "loss": 0.3484, + "step": 15060 + }, + { + "epoch": 1.814569536423841, + "grad_norm": 4.458260536193848, + "learning_rate": 6.650226346750178e-07, + "loss": 0.3523, + "step": 15070 + }, + { + "epoch": 1.8157736303431666, + "grad_norm": 4.671230316162109, + "learning_rate": 6.645265984770417e-07, + "loss": 0.3501, + "step": 15080 + }, + { + "epoch": 1.8169777242624925, + "grad_norm": 4.7510504722595215, + "learning_rate": 6.640303806223116e-07, + "loss": 0.3565, + "step": 15090 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 4.930042266845703, + "learning_rate": 6.635339816587108e-07, + "loss": 0.3519, + "step": 15100 + }, + { + "epoch": 1.819385912101144, + "grad_norm": 4.401383876800537, + "learning_rate": 6.63037402134322e-07, + "loss": 0.3444, + "step": 15110 + }, + { + "epoch": 1.8205900060204696, + "grad_norm": 4.55552864074707, + "learning_rate": 6.625406425974277e-07, + "loss": 0.3593, + "step": 15120 + }, + { + "epoch": 1.8217940999397952, + "grad_norm": 4.647222995758057, + "learning_rate": 6.620437035965088e-07, + "loss": 0.3513, + "step": 15130 + }, + { + "epoch": 1.8229981938591209, + "grad_norm": 4.750911235809326, + "learning_rate": 6.615465856802446e-07, + "loss": 0.3754, + "step": 15140 + }, + { + "epoch": 1.8242022877784467, + "grad_norm": 3.9289968013763428, + "learning_rate": 6.610492893975117e-07, + "loss": 0.3511, + "step": 15150 + }, + { + "epoch": 1.8254063816977726, + "grad_norm": 3.834213972091675, + "learning_rate": 6.605518152973842e-07, + "loss": 0.3446, + "step": 15160 + }, + { + "epoch": 1.8266104756170982, + "grad_norm": 5.1060075759887695, + "learning_rate": 6.600541639291316e-07, + "loss": 0.3548, + "step": 15170 + }, + { + "epoch": 1.8278145695364238, + "grad_norm": 4.696617603302002, + "learning_rate": 6.595563358422202e-07, + "loss": 0.3576, + "step": 15180 + }, + { + "epoch": 1.8290186634557495, + "grad_norm": 4.141697883605957, + "learning_rate": 6.590583315863105e-07, + "loss": 0.3513, + "step": 15190 + }, + { + "epoch": 1.8302227573750751, + "grad_norm": 5.357382774353027, + "learning_rate": 6.58560151711258e-07, + "loss": 0.3508, + "step": 15200 + }, + { + "epoch": 1.831426851294401, + "grad_norm": 4.808011054992676, + 
"learning_rate": 6.58061796767112e-07, + "loss": 0.3568, + "step": 15210 + }, + { + "epoch": 1.8326309452137268, + "grad_norm": 4.633763790130615, + "learning_rate": 6.575632673041151e-07, + "loss": 0.355, + "step": 15220 + }, + { + "epoch": 1.8338350391330525, + "grad_norm": 4.953246116638184, + "learning_rate": 6.570645638727026e-07, + "loss": 0.3604, + "step": 15230 + }, + { + "epoch": 1.835039133052378, + "grad_norm": 4.354135513305664, + "learning_rate": 6.565656870235019e-07, + "loss": 0.337, + "step": 15240 + }, + { + "epoch": 1.8362432269717037, + "grad_norm": 5.245918273925781, + "learning_rate": 6.560666373073316e-07, + "loss": 0.3711, + "step": 15250 + }, + { + "epoch": 1.8374473208910294, + "grad_norm": 5.532114028930664, + "learning_rate": 6.555674152752016e-07, + "loss": 0.3618, + "step": 15260 + }, + { + "epoch": 1.8386514148103552, + "grad_norm": 5.3348212242126465, + "learning_rate": 6.55068021478312e-07, + "loss": 0.3646, + "step": 15270 + }, + { + "epoch": 1.839855508729681, + "grad_norm": 4.423579216003418, + "learning_rate": 6.54568456468052e-07, + "loss": 0.3522, + "step": 15280 + }, + { + "epoch": 1.8410596026490067, + "grad_norm": 4.966454982757568, + "learning_rate": 6.540687207960005e-07, + "loss": 0.3592, + "step": 15290 + }, + { + "epoch": 1.8422636965683323, + "grad_norm": 4.406902313232422, + "learning_rate": 6.535688150139246e-07, + "loss": 0.3637, + "step": 15300 + }, + { + "epoch": 1.843467790487658, + "grad_norm": 4.565004348754883, + "learning_rate": 6.530687396737791e-07, + "loss": 0.343, + "step": 15310 + }, + { + "epoch": 1.8446718844069836, + "grad_norm": 4.898248672485352, + "learning_rate": 6.525684953277061e-07, + "loss": 0.3589, + "step": 15320 + }, + { + "epoch": 1.8458759783263095, + "grad_norm": 4.416904449462891, + "learning_rate": 6.520680825280344e-07, + "loss": 0.3297, + "step": 15330 + }, + { + "epoch": 1.8470800722456353, + "grad_norm": 4.844006538391113, + "learning_rate": 6.515675018272786e-07, + "loss": 0.3692, + "step": 15340 + }, + { + "epoch": 1.848284166164961, + "grad_norm": 4.351726531982422, + "learning_rate": 6.510667537781389e-07, + "loss": 0.3627, + "step": 15350 + }, + { + "epoch": 1.8494882600842866, + "grad_norm": 4.276306629180908, + "learning_rate": 6.505658389335e-07, + "loss": 0.3581, + "step": 15360 + }, + { + "epoch": 1.8506923540036122, + "grad_norm": 4.866278648376465, + "learning_rate": 6.500647578464311e-07, + "loss": 0.3756, + "step": 15370 + }, + { + "epoch": 1.8518964479229378, + "grad_norm": 4.005789279937744, + "learning_rate": 6.495635110701847e-07, + "loss": 0.3551, + "step": 15380 + }, + { + "epoch": 1.8531005418422637, + "grad_norm": 4.069939136505127, + "learning_rate": 6.490620991581963e-07, + "loss": 0.3426, + "step": 15390 + }, + { + "epoch": 1.8543046357615895, + "grad_norm": 5.377545356750488, + "learning_rate": 6.485605226640836e-07, + "loss": 0.363, + "step": 15400 + }, + { + "epoch": 1.8555087296809152, + "grad_norm": 4.171127796173096, + "learning_rate": 6.480587821416465e-07, + "loss": 0.3601, + "step": 15410 + }, + { + "epoch": 1.8567128236002408, + "grad_norm": 4.944298267364502, + "learning_rate": 6.475568781448654e-07, + "loss": 0.3445, + "step": 15420 + }, + { + "epoch": 1.8579169175195664, + "grad_norm": 4.719433784484863, + "learning_rate": 6.470548112279015e-07, + "loss": 0.349, + "step": 15430 + }, + { + "epoch": 1.859121011438892, + "grad_norm": 4.289638042449951, + "learning_rate": 6.465525819450959e-07, + "loss": 0.3675, + "step": 15440 + }, + { + "epoch": 1.860325105358218, + 
"grad_norm": 4.580896377563477, + "learning_rate": 6.46050190850969e-07, + "loss": 0.362, + "step": 15450 + }, + { + "epoch": 1.8615291992775438, + "grad_norm": 4.68642520904541, + "learning_rate": 6.455476385002195e-07, + "loss": 0.3544, + "step": 15460 + }, + { + "epoch": 1.8627332931968694, + "grad_norm": 4.221519470214844, + "learning_rate": 6.450449254477246e-07, + "loss": 0.3557, + "step": 15470 + }, + { + "epoch": 1.863937387116195, + "grad_norm": 5.103092670440674, + "learning_rate": 6.445420522485387e-07, + "loss": 0.3575, + "step": 15480 + }, + { + "epoch": 1.8651414810355207, + "grad_norm": 5.300514221191406, + "learning_rate": 6.440390194578933e-07, + "loss": 0.3655, + "step": 15490 + }, + { + "epoch": 1.8663455749548463, + "grad_norm": 5.2280049324035645, + "learning_rate": 6.435358276311955e-07, + "loss": 0.3615, + "step": 15500 + }, + { + "epoch": 1.8675496688741722, + "grad_norm": 4.393173694610596, + "learning_rate": 6.430324773240287e-07, + "loss": 0.3617, + "step": 15510 + }, + { + "epoch": 1.868753762793498, + "grad_norm": 3.9914498329162598, + "learning_rate": 6.425289690921508e-07, + "loss": 0.3482, + "step": 15520 + }, + { + "epoch": 1.8699578567128237, + "grad_norm": 4.967134475708008, + "learning_rate": 6.420253034914943e-07, + "loss": 0.3635, + "step": 15530 + }, + { + "epoch": 1.8711619506321493, + "grad_norm": 4.27791166305542, + "learning_rate": 6.415214810781653e-07, + "loss": 0.3508, + "step": 15540 + }, + { + "epoch": 1.872366044551475, + "grad_norm": 4.6500163078308105, + "learning_rate": 6.410175024084431e-07, + "loss": 0.3589, + "step": 15550 + }, + { + "epoch": 1.8735701384708006, + "grad_norm": 4.22102689743042, + "learning_rate": 6.405133680387797e-07, + "loss": 0.3558, + "step": 15560 + }, + { + "epoch": 1.8747742323901264, + "grad_norm": 4.9325947761535645, + "learning_rate": 6.400090785257987e-07, + "loss": 0.3696, + "step": 15570 + }, + { + "epoch": 1.8759783263094523, + "grad_norm": 3.8292155265808105, + "learning_rate": 6.395046344262951e-07, + "loss": 0.356, + "step": 15580 + }, + { + "epoch": 1.877182420228778, + "grad_norm": 4.739902973175049, + "learning_rate": 6.390000362972348e-07, + "loss": 0.3407, + "step": 15590 + }, + { + "epoch": 1.8783865141481035, + "grad_norm": 3.770754814147949, + "learning_rate": 6.384952846957535e-07, + "loss": 0.3502, + "step": 15600 + }, + { + "epoch": 1.8795906080674292, + "grad_norm": 4.367559432983398, + "learning_rate": 6.379903801791566e-07, + "loss": 0.3566, + "step": 15610 + }, + { + "epoch": 1.8807947019867548, + "grad_norm": 5.16295862197876, + "learning_rate": 6.374853233049182e-07, + "loss": 0.3668, + "step": 15620 + }, + { + "epoch": 1.8819987959060807, + "grad_norm": 4.346946716308594, + "learning_rate": 6.369801146306802e-07, + "loss": 0.3483, + "step": 15630 + }, + { + "epoch": 1.8832028898254065, + "grad_norm": 4.716429710388184, + "learning_rate": 6.36474754714253e-07, + "loss": 0.3452, + "step": 15640 + }, + { + "epoch": 1.8844069837447321, + "grad_norm": 4.5193891525268555, + "learning_rate": 6.359692441136131e-07, + "loss": 0.361, + "step": 15650 + }, + { + "epoch": 1.8856110776640578, + "grad_norm": 3.9874355792999268, + "learning_rate": 6.354635833869042e-07, + "loss": 0.358, + "step": 15660 + }, + { + "epoch": 1.8868151715833834, + "grad_norm": 4.598703861236572, + "learning_rate": 6.349577730924349e-07, + "loss": 0.35, + "step": 15670 + }, + { + "epoch": 1.8880192655027093, + "grad_norm": 5.374682426452637, + "learning_rate": 6.344518137886798e-07, + "loss": 0.3639, + "step": 15680 + }, 
+ { + "epoch": 1.889223359422035, + "grad_norm": 6.002275466918945, + "learning_rate": 6.339457060342772e-07, + "loss": 0.3546, + "step": 15690 + }, + { + "epoch": 1.8904274533413608, + "grad_norm": 4.864243984222412, + "learning_rate": 6.3343945038803e-07, + "loss": 0.3543, + "step": 15700 + }, + { + "epoch": 1.8916315472606864, + "grad_norm": 3.9879305362701416, + "learning_rate": 6.329330474089039e-07, + "loss": 0.3549, + "step": 15710 + }, + { + "epoch": 1.892835641180012, + "grad_norm": 4.457694053649902, + "learning_rate": 6.324264976560277e-07, + "loss": 0.3584, + "step": 15720 + }, + { + "epoch": 1.8940397350993377, + "grad_norm": 3.741135835647583, + "learning_rate": 6.319198016886918e-07, + "loss": 0.3618, + "step": 15730 + }, + { + "epoch": 1.8952438290186635, + "grad_norm": 4.002588272094727, + "learning_rate": 6.314129600663484e-07, + "loss": 0.3492, + "step": 15740 + }, + { + "epoch": 1.8964479229379891, + "grad_norm": 4.551817893981934, + "learning_rate": 6.309059733486102e-07, + "loss": 0.3567, + "step": 15750 + }, + { + "epoch": 1.897652016857315, + "grad_norm": 4.268725872039795, + "learning_rate": 6.303988420952505e-07, + "loss": 0.3591, + "step": 15760 + }, + { + "epoch": 1.8988561107766406, + "grad_norm": 4.963777542114258, + "learning_rate": 6.298915668662017e-07, + "loss": 0.3551, + "step": 15770 + }, + { + "epoch": 1.9000602046959663, + "grad_norm": 4.293519973754883, + "learning_rate": 6.293841482215558e-07, + "loss": 0.3586, + "step": 15780 + }, + { + "epoch": 1.901264298615292, + "grad_norm": 4.556762218475342, + "learning_rate": 6.288765867215625e-07, + "loss": 0.3538, + "step": 15790 + }, + { + "epoch": 1.9024683925346177, + "grad_norm": 3.792178153991699, + "learning_rate": 6.283688829266297e-07, + "loss": 0.3331, + "step": 15800 + }, + { + "epoch": 1.9036724864539434, + "grad_norm": 5.197310447692871, + "learning_rate": 6.278610373973219e-07, + "loss": 0.3515, + "step": 15810 + }, + { + "epoch": 1.9048765803732692, + "grad_norm": 5.082350730895996, + "learning_rate": 6.273530506943609e-07, + "loss": 0.3389, + "step": 15820 + }, + { + "epoch": 1.9060806742925949, + "grad_norm": 4.892045021057129, + "learning_rate": 6.268449233786236e-07, + "loss": 0.3531, + "step": 15830 + }, + { + "epoch": 1.9072847682119205, + "grad_norm": 4.555123805999756, + "learning_rate": 6.263366560111423e-07, + "loss": 0.3414, + "step": 15840 + }, + { + "epoch": 1.9084888621312461, + "grad_norm": 4.728994846343994, + "learning_rate": 6.258282491531043e-07, + "loss": 0.3556, + "step": 15850 + }, + { + "epoch": 1.909692956050572, + "grad_norm": 4.745967388153076, + "learning_rate": 6.253197033658507e-07, + "loss": 0.343, + "step": 15860 + }, + { + "epoch": 1.9108970499698976, + "grad_norm": 4.600861072540283, + "learning_rate": 6.248110192108757e-07, + "loss": 0.3475, + "step": 15870 + }, + { + "epoch": 1.9121011438892235, + "grad_norm": 4.099234580993652, + "learning_rate": 6.243021972498269e-07, + "loss": 0.3624, + "step": 15880 + }, + { + "epoch": 1.913305237808549, + "grad_norm": 4.272284030914307, + "learning_rate": 6.237932380445034e-07, + "loss": 0.3565, + "step": 15890 + }, + { + "epoch": 1.9145093317278747, + "grad_norm": 3.7602131366729736, + "learning_rate": 6.232841421568565e-07, + "loss": 0.3499, + "step": 15900 + }, + { + "epoch": 1.9157134256472004, + "grad_norm": 4.971080303192139, + "learning_rate": 6.227749101489877e-07, + "loss": 0.3701, + "step": 15910 + }, + { + "epoch": 1.9169175195665262, + "grad_norm": 5.319652080535889, + "learning_rate": 6.222655425831495e-07, 
+ "loss": 0.3451, + "step": 15920 + }, + { + "epoch": 1.9181216134858519, + "grad_norm": 4.283812522888184, + "learning_rate": 6.217560400217433e-07, + "loss": 0.3559, + "step": 15930 + }, + { + "epoch": 1.9193257074051777, + "grad_norm": 5.055164813995361, + "learning_rate": 6.212464030273204e-07, + "loss": 0.3562, + "step": 15940 + }, + { + "epoch": 1.9205298013245033, + "grad_norm": 4.813416004180908, + "learning_rate": 6.207366321625798e-07, + "loss": 0.3606, + "step": 15950 + }, + { + "epoch": 1.921733895243829, + "grad_norm": 4.402296543121338, + "learning_rate": 6.202267279903686e-07, + "loss": 0.353, + "step": 15960 + }, + { + "epoch": 1.9229379891631546, + "grad_norm": 4.458485126495361, + "learning_rate": 6.197166910736814e-07, + "loss": 0.3523, + "step": 15970 + }, + { + "epoch": 1.9241420830824805, + "grad_norm": 3.5323286056518555, + "learning_rate": 6.192065219756587e-07, + "loss": 0.357, + "step": 15980 + }, + { + "epoch": 1.925346177001806, + "grad_norm": 4.047741413116455, + "learning_rate": 6.186962212595876e-07, + "loss": 0.3513, + "step": 15990 + }, + { + "epoch": 1.926550270921132, + "grad_norm": 4.608432769775391, + "learning_rate": 6.181857894889e-07, + "loss": 0.3556, + "step": 16000 + }, + { + "epoch": 1.9277543648404576, + "grad_norm": 4.246164321899414, + "learning_rate": 6.17675227227173e-07, + "loss": 0.3274, + "step": 16010 + }, + { + "epoch": 1.9289584587597832, + "grad_norm": 4.55797004699707, + "learning_rate": 6.171645350381272e-07, + "loss": 0.3537, + "step": 16020 + }, + { + "epoch": 1.9301625526791089, + "grad_norm": 4.349902629852295, + "learning_rate": 6.166537134856272e-07, + "loss": 0.3454, + "step": 16030 + }, + { + "epoch": 1.9313666465984347, + "grad_norm": 4.9922614097595215, + "learning_rate": 6.161427631336799e-07, + "loss": 0.3377, + "step": 16040 + }, + { + "epoch": 1.9325707405177603, + "grad_norm": 4.467525005340576, + "learning_rate": 6.156316845464351e-07, + "loss": 0.345, + "step": 16050 + }, + { + "epoch": 1.9337748344370862, + "grad_norm": 4.589630603790283, + "learning_rate": 6.151204782881835e-07, + "loss": 0.3393, + "step": 16060 + }, + { + "epoch": 1.9349789283564118, + "grad_norm": 4.475553035736084, + "learning_rate": 6.146091449233571e-07, + "loss": 0.3544, + "step": 16070 + }, + { + "epoch": 1.9361830222757375, + "grad_norm": 4.827112197875977, + "learning_rate": 6.140976850165283e-07, + "loss": 0.3447, + "step": 16080 + }, + { + "epoch": 1.937387116195063, + "grad_norm": 3.81062388420105, + "learning_rate": 6.135860991324092e-07, + "loss": 0.3493, + "step": 16090 + }, + { + "epoch": 1.938591210114389, + "grad_norm": 4.450663089752197, + "learning_rate": 6.130743878358505e-07, + "loss": 0.3601, + "step": 16100 + }, + { + "epoch": 1.9397953040337146, + "grad_norm": 3.878636598587036, + "learning_rate": 6.125625516918421e-07, + "loss": 0.3638, + "step": 16110 + }, + { + "epoch": 1.9409993979530404, + "grad_norm": 4.681748390197754, + "learning_rate": 6.120505912655114e-07, + "loss": 0.3542, + "step": 16120 + }, + { + "epoch": 1.942203491872366, + "grad_norm": 5.228558540344238, + "learning_rate": 6.115385071221231e-07, + "loss": 0.3538, + "step": 16130 + }, + { + "epoch": 1.9434075857916917, + "grad_norm": 5.1694488525390625, + "learning_rate": 6.110262998270781e-07, + "loss": 0.3689, + "step": 16140 + }, + { + "epoch": 1.9446116797110173, + "grad_norm": 4.253943920135498, + "learning_rate": 6.10513969945914e-07, + "loss": 0.3518, + "step": 16150 + }, + { + "epoch": 1.9458157736303432, + "grad_norm": 4.636354446411133, + 
"learning_rate": 6.100015180443031e-07, + "loss": 0.3643, + "step": 16160 + }, + { + "epoch": 1.9470198675496688, + "grad_norm": 3.8941125869750977, + "learning_rate": 6.094889446880529e-07, + "loss": 0.3444, + "step": 16170 + }, + { + "epoch": 1.9482239614689947, + "grad_norm": 4.6928391456604, + "learning_rate": 6.089762504431046e-07, + "loss": 0.3541, + "step": 16180 + }, + { + "epoch": 1.9494280553883203, + "grad_norm": 4.19013786315918, + "learning_rate": 6.084634358755334e-07, + "loss": 0.357, + "step": 16190 + }, + { + "epoch": 1.950632149307646, + "grad_norm": 4.565307140350342, + "learning_rate": 6.079505015515465e-07, + "loss": 0.3419, + "step": 16200 + }, + { + "epoch": 1.9518362432269716, + "grad_norm": 5.345344543457031, + "learning_rate": 6.074374480374843e-07, + "loss": 0.3569, + "step": 16210 + }, + { + "epoch": 1.9530403371462974, + "grad_norm": 4.672290802001953, + "learning_rate": 6.069242758998181e-07, + "loss": 0.3564, + "step": 16220 + }, + { + "epoch": 1.954244431065623, + "grad_norm": 4.522906303405762, + "learning_rate": 6.064109857051505e-07, + "loss": 0.35, + "step": 16230 + }, + { + "epoch": 1.955448524984949, + "grad_norm": 4.692704200744629, + "learning_rate": 6.058975780202143e-07, + "loss": 0.334, + "step": 16240 + }, + { + "epoch": 1.9566526189042746, + "grad_norm": 4.350996971130371, + "learning_rate": 6.053840534118722e-07, + "loss": 0.3512, + "step": 16250 + }, + { + "epoch": 1.9578567128236002, + "grad_norm": 4.869346618652344, + "learning_rate": 6.04870412447116e-07, + "loss": 0.3415, + "step": 16260 + }, + { + "epoch": 1.9590608067429258, + "grad_norm": 4.5982818603515625, + "learning_rate": 6.043566556930655e-07, + "loss": 0.3697, + "step": 16270 + }, + { + "epoch": 1.9602649006622517, + "grad_norm": 4.133756637573242, + "learning_rate": 6.038427837169688e-07, + "loss": 0.3498, + "step": 16280 + }, + { + "epoch": 1.9614689945815773, + "grad_norm": 4.6877546310424805, + "learning_rate": 6.033287970862013e-07, + "loss": 0.3622, + "step": 16290 + }, + { + "epoch": 1.9626730885009032, + "grad_norm": 5.100693702697754, + "learning_rate": 6.028146963682648e-07, + "loss": 0.3571, + "step": 16300 + }, + { + "epoch": 1.9638771824202288, + "grad_norm": 5.0933685302734375, + "learning_rate": 6.023004821307867e-07, + "loss": 0.3247, + "step": 16310 + }, + { + "epoch": 1.9650812763395544, + "grad_norm": 3.7194926738739014, + "learning_rate": 6.017861549415207e-07, + "loss": 0.3519, + "step": 16320 + }, + { + "epoch": 1.96628537025888, + "grad_norm": 4.424744606018066, + "learning_rate": 6.012717153683442e-07, + "loss": 0.3401, + "step": 16330 + }, + { + "epoch": 1.967489464178206, + "grad_norm": 3.9198262691497803, + "learning_rate": 6.007571639792593e-07, + "loss": 0.3434, + "step": 16340 + }, + { + "epoch": 1.9686935580975318, + "grad_norm": 3.9350152015686035, + "learning_rate": 6.002425013423913e-07, + "loss": 0.3447, + "step": 16350 + }, + { + "epoch": 1.9698976520168574, + "grad_norm": 4.852246284484863, + "learning_rate": 5.997277280259885e-07, + "loss": 0.3457, + "step": 16360 + }, + { + "epoch": 1.971101745936183, + "grad_norm": 4.658691883087158, + "learning_rate": 5.992128445984212e-07, + "loss": 0.3692, + "step": 16370 + }, + { + "epoch": 1.9723058398555087, + "grad_norm": 4.637414932250977, + "learning_rate": 5.986978516281815e-07, + "loss": 0.3555, + "step": 16380 + }, + { + "epoch": 1.9735099337748343, + "grad_norm": 4.982326984405518, + "learning_rate": 5.981827496838822e-07, + "loss": 0.3526, + "step": 16390 + }, + { + "epoch": 
1.9747140276941602, + "grad_norm": 4.729382514953613, + "learning_rate": 5.976675393342566e-07, + "loss": 0.3558, + "step": 16400 + }, + { + "epoch": 1.975918121613486, + "grad_norm": 4.774322509765625, + "learning_rate": 5.971522211481575e-07, + "loss": 0.358, + "step": 16410 + }, + { + "epoch": 1.9771222155328116, + "grad_norm": 4.948471546173096, + "learning_rate": 5.966367956945572e-07, + "loss": 0.359, + "step": 16420 + }, + { + "epoch": 1.9783263094521373, + "grad_norm": 4.0199198722839355, + "learning_rate": 5.961212635425459e-07, + "loss": 0.3423, + "step": 16430 + }, + { + "epoch": 1.979530403371463, + "grad_norm": 4.141156196594238, + "learning_rate": 5.956056252613319e-07, + "loss": 0.3475, + "step": 16440 + }, + { + "epoch": 1.9807344972907885, + "grad_norm": 4.316824913024902, + "learning_rate": 5.950898814202407e-07, + "loss": 0.3436, + "step": 16450 + }, + { + "epoch": 1.9819385912101144, + "grad_norm": 5.594763278961182, + "learning_rate": 5.945740325887144e-07, + "loss": 0.3435, + "step": 16460 + }, + { + "epoch": 1.9831426851294403, + "grad_norm": 4.995075702667236, + "learning_rate": 5.940580793363105e-07, + "loss": 0.3539, + "step": 16470 + }, + { + "epoch": 1.9843467790487659, + "grad_norm": 4.139880180358887, + "learning_rate": 5.935420222327028e-07, + "loss": 0.3544, + "step": 16480 + }, + { + "epoch": 1.9855508729680915, + "grad_norm": 3.917797088623047, + "learning_rate": 5.930258618476785e-07, + "loss": 0.3331, + "step": 16490 + }, + { + "epoch": 1.9867549668874172, + "grad_norm": 5.234194755554199, + "learning_rate": 5.9250959875114e-07, + "loss": 0.3477, + "step": 16500 + }, + { + "epoch": 1.9879590608067428, + "grad_norm": 4.324552059173584, + "learning_rate": 5.919932335131022e-07, + "loss": 0.341, + "step": 16510 + }, + { + "epoch": 1.9891631547260686, + "grad_norm": 5.321447849273682, + "learning_rate": 5.914767667036936e-07, + "loss": 0.3606, + "step": 16520 + }, + { + "epoch": 1.9903672486453945, + "grad_norm": 4.159404277801514, + "learning_rate": 5.90960198893154e-07, + "loss": 0.3484, + "step": 16530 + }, + { + "epoch": 1.9915713425647201, + "grad_norm": 4.632839202880859, + "learning_rate": 5.904435306518354e-07, + "loss": 0.35, + "step": 16540 + }, + { + "epoch": 1.9927754364840458, + "grad_norm": 4.1767168045043945, + "learning_rate": 5.899267625502004e-07, + "loss": 0.356, + "step": 16550 + }, + { + "epoch": 1.9939795304033714, + "grad_norm": 4.770878314971924, + "learning_rate": 5.894098951588218e-07, + "loss": 0.3338, + "step": 16560 + }, + { + "epoch": 1.995183624322697, + "grad_norm": 4.481430530548096, + "learning_rate": 5.888929290483821e-07, + "loss": 0.3569, + "step": 16570 + }, + { + "epoch": 1.9963877182420229, + "grad_norm": 4.496611595153809, + "learning_rate": 5.883758647896729e-07, + "loss": 0.3602, + "step": 16580 + }, + { + "epoch": 1.9975918121613487, + "grad_norm": 3.9505410194396973, + "learning_rate": 5.878587029535942e-07, + "loss": 0.3403, + "step": 16590 + }, + { + "epoch": 1.9987959060806744, + "grad_norm": 4.308087348937988, + "learning_rate": 5.873414441111532e-07, + "loss": 0.3556, + "step": 16600 + }, + { + "epoch": 2.0, + "grad_norm": 4.440168857574463, + "learning_rate": 5.868240888334652e-07, + "loss": 0.3312, + "step": 16610 + }, + { + "epoch": 2.0012040939193256, + "grad_norm": 4.038889408111572, + "learning_rate": 5.863066376917508e-07, + "loss": 0.3224, + "step": 16620 + }, + { + "epoch": 2.0024081878386513, + "grad_norm": 4.833006381988525, + "learning_rate": 5.857890912573376e-07, + "loss": 0.3001, + "step": 
16630 + }, + { + "epoch": 2.0036122817579773, + "grad_norm": 4.160131931304932, + "learning_rate": 5.852714501016572e-07, + "loss": 0.2985, + "step": 16640 + }, + { + "epoch": 2.004816375677303, + "grad_norm": 5.080901622772217, + "learning_rate": 5.84753714796247e-07, + "loss": 0.3228, + "step": 16650 + }, + { + "epoch": 2.0060204695966286, + "grad_norm": 4.37393856048584, + "learning_rate": 5.842358859127478e-07, + "loss": 0.3036, + "step": 16660 + }, + { + "epoch": 2.0072245635159542, + "grad_norm": 4.473939895629883, + "learning_rate": 5.837179640229032e-07, + "loss": 0.3135, + "step": 16670 + }, + { + "epoch": 2.00842865743528, + "grad_norm": 5.297366619110107, + "learning_rate": 5.831999496985605e-07, + "loss": 0.3059, + "step": 16680 + }, + { + "epoch": 2.0096327513546055, + "grad_norm": 5.174331188201904, + "learning_rate": 5.826818435116683e-07, + "loss": 0.3123, + "step": 16690 + }, + { + "epoch": 2.0108368452739316, + "grad_norm": 4.679065704345703, + "learning_rate": 5.821636460342769e-07, + "loss": 0.3232, + "step": 16700 + }, + { + "epoch": 2.012040939193257, + "grad_norm": 4.446617126464844, + "learning_rate": 5.816453578385375e-07, + "loss": 0.3063, + "step": 16710 + }, + { + "epoch": 2.013245033112583, + "grad_norm": 5.05123233795166, + "learning_rate": 5.811269794967014e-07, + "loss": 0.3095, + "step": 16720 + }, + { + "epoch": 2.0144491270319085, + "grad_norm": 4.649383544921875, + "learning_rate": 5.806085115811191e-07, + "loss": 0.309, + "step": 16730 + }, + { + "epoch": 2.015653220951234, + "grad_norm": 4.328246116638184, + "learning_rate": 5.800899546642406e-07, + "loss": 0.2981, + "step": 16740 + }, + { + "epoch": 2.0168573148705597, + "grad_norm": 4.504574775695801, + "learning_rate": 5.795713093186136e-07, + "loss": 0.3162, + "step": 16750 + }, + { + "epoch": 2.018061408789886, + "grad_norm": 4.636085033416748, + "learning_rate": 5.790525761168839e-07, + "loss": 0.318, + "step": 16760 + }, + { + "epoch": 2.0192655027092115, + "grad_norm": 5.4193291664123535, + "learning_rate": 5.785337556317938e-07, + "loss": 0.3216, + "step": 16770 + }, + { + "epoch": 2.020469596628537, + "grad_norm": 4.318239212036133, + "learning_rate": 5.780148484361826e-07, + "loss": 0.3018, + "step": 16780 + }, + { + "epoch": 2.0216736905478627, + "grad_norm": 4.4032087326049805, + "learning_rate": 5.774958551029847e-07, + "loss": 0.3078, + "step": 16790 + }, + { + "epoch": 2.0228777844671884, + "grad_norm": 4.946054458618164, + "learning_rate": 5.769767762052301e-07, + "loss": 0.3155, + "step": 16800 + }, + { + "epoch": 2.024081878386514, + "grad_norm": 4.1051344871521, + "learning_rate": 5.764576123160429e-07, + "loss": 0.3183, + "step": 16810 + }, + { + "epoch": 2.02528597230584, + "grad_norm": 4.6641459465026855, + "learning_rate": 5.759383640086415e-07, + "loss": 0.3063, + "step": 16820 + }, + { + "epoch": 2.0264900662251657, + "grad_norm": 4.728779315948486, + "learning_rate": 5.75419031856337e-07, + "loss": 0.3153, + "step": 16830 + }, + { + "epoch": 2.0276941601444913, + "grad_norm": 5.103392124176025, + "learning_rate": 5.748996164325331e-07, + "loss": 0.304, + "step": 16840 + }, + { + "epoch": 2.028898254063817, + "grad_norm": 5.283243656158447, + "learning_rate": 5.743801183107261e-07, + "loss": 0.3188, + "step": 16850 + }, + { + "epoch": 2.0301023479831426, + "grad_norm": 4.704992294311523, + "learning_rate": 5.73860538064503e-07, + "loss": 0.306, + "step": 16860 + }, + { + "epoch": 2.0313064419024682, + "grad_norm": 5.523532390594482, + "learning_rate": 5.733408762675414e-07, + 
"loss": 0.3164, + "step": 16870 + }, + { + "epoch": 2.0325105358217943, + "grad_norm": 4.29448127746582, + "learning_rate": 5.728211334936093e-07, + "loss": 0.3011, + "step": 16880 + }, + { + "epoch": 2.03371462974112, + "grad_norm": 4.910971164703369, + "learning_rate": 5.723013103165642e-07, + "loss": 0.3093, + "step": 16890 + }, + { + "epoch": 2.0349187236604456, + "grad_norm": 4.527739524841309, + "learning_rate": 5.717814073103519e-07, + "loss": 0.2994, + "step": 16900 + }, + { + "epoch": 2.036122817579771, + "grad_norm": 4.409666061401367, + "learning_rate": 5.712614250490064e-07, + "loss": 0.3165, + "step": 16910 + }, + { + "epoch": 2.037326911499097, + "grad_norm": 4.129342079162598, + "learning_rate": 5.707413641066497e-07, + "loss": 0.3159, + "step": 16920 + }, + { + "epoch": 2.0385310054184225, + "grad_norm": 4.361571788787842, + "learning_rate": 5.702212250574904e-07, + "loss": 0.3008, + "step": 16930 + }, + { + "epoch": 2.0397350993377485, + "grad_norm": 4.482879638671875, + "learning_rate": 5.697010084758232e-07, + "loss": 0.3169, + "step": 16940 + }, + { + "epoch": 2.040939193257074, + "grad_norm": 4.7954535484313965, + "learning_rate": 5.691807149360285e-07, + "loss": 0.3057, + "step": 16950 + }, + { + "epoch": 2.0421432871764, + "grad_norm": 4.840571403503418, + "learning_rate": 5.686603450125717e-07, + "loss": 0.2973, + "step": 16960 + }, + { + "epoch": 2.0433473810957254, + "grad_norm": 4.597223281860352, + "learning_rate": 5.681398992800024e-07, + "loss": 0.3144, + "step": 16970 + }, + { + "epoch": 2.044551475015051, + "grad_norm": 4.794790744781494, + "learning_rate": 5.676193783129542e-07, + "loss": 0.3087, + "step": 16980 + }, + { + "epoch": 2.0457555689343767, + "grad_norm": 4.340571403503418, + "learning_rate": 5.670987826861435e-07, + "loss": 0.3083, + "step": 16990 + }, + { + "epoch": 2.046959662853703, + "grad_norm": 4.629497051239014, + "learning_rate": 5.665781129743693e-07, + "loss": 0.3088, + "step": 17000 + }, + { + "epoch": 2.0481637567730284, + "grad_norm": 4.827451229095459, + "learning_rate": 5.660573697525121e-07, + "loss": 0.3039, + "step": 17010 + }, + { + "epoch": 2.049367850692354, + "grad_norm": 4.8336381912231445, + "learning_rate": 5.655365535955342e-07, + "loss": 0.306, + "step": 17020 + }, + { + "epoch": 2.0505719446116797, + "grad_norm": 5.4790940284729, + "learning_rate": 5.650156650784777e-07, + "loss": 0.3129, + "step": 17030 + }, + { + "epoch": 2.0517760385310053, + "grad_norm": 3.705552577972412, + "learning_rate": 5.64494704776465e-07, + "loss": 0.3062, + "step": 17040 + }, + { + "epoch": 2.052980132450331, + "grad_norm": 4.869053840637207, + "learning_rate": 5.639736732646976e-07, + "loss": 0.3169, + "step": 17050 + }, + { + "epoch": 2.054184226369657, + "grad_norm": 4.759436130523682, + "learning_rate": 5.634525711184556e-07, + "loss": 0.3129, + "step": 17060 + }, + { + "epoch": 2.0553883202889827, + "grad_norm": 4.388055324554443, + "learning_rate": 5.629313989130975e-07, + "loss": 0.3026, + "step": 17070 + }, + { + "epoch": 2.0565924142083083, + "grad_norm": 5.617096900939941, + "learning_rate": 5.624101572240587e-07, + "loss": 0.3064, + "step": 17080 + }, + { + "epoch": 2.057796508127634, + "grad_norm": 4.787253379821777, + "learning_rate": 5.618888466268513e-07, + "loss": 0.3174, + "step": 17090 + }, + { + "epoch": 2.0590006020469596, + "grad_norm": 4.347087383270264, + "learning_rate": 5.613674676970638e-07, + "loss": 0.3028, + "step": 17100 + }, + { + "epoch": 2.060204695966285, + "grad_norm": 4.601030349731445, + 
"learning_rate": 5.608460210103598e-07, + "loss": 0.3136, + "step": 17110 + }, + { + "epoch": 2.0614087898856113, + "grad_norm": 4.6767048835754395, + "learning_rate": 5.603245071424783e-07, + "loss": 0.3126, + "step": 17120 + }, + { + "epoch": 2.062612883804937, + "grad_norm": 5.636801719665527, + "learning_rate": 5.598029266692315e-07, + "loss": 0.3107, + "step": 17130 + }, + { + "epoch": 2.0638169777242625, + "grad_norm": 5.514817714691162, + "learning_rate": 5.592812801665061e-07, + "loss": 0.3191, + "step": 17140 + }, + { + "epoch": 2.065021071643588, + "grad_norm": 4.12761116027832, + "learning_rate": 5.587595682102611e-07, + "loss": 0.3119, + "step": 17150 + }, + { + "epoch": 2.066225165562914, + "grad_norm": 4.940089702606201, + "learning_rate": 5.582377913765283e-07, + "loss": 0.3072, + "step": 17160 + }, + { + "epoch": 2.0674292594822394, + "grad_norm": 4.235925674438477, + "learning_rate": 5.577159502414103e-07, + "loss": 0.3168, + "step": 17170 + }, + { + "epoch": 2.0686333534015655, + "grad_norm": 5.036463260650635, + "learning_rate": 5.57194045381082e-07, + "loss": 0.3236, + "step": 17180 + }, + { + "epoch": 2.069837447320891, + "grad_norm": 3.9009006023406982, + "learning_rate": 5.56672077371787e-07, + "loss": 0.3111, + "step": 17190 + }, + { + "epoch": 2.0710415412402168, + "grad_norm": 4.592634677886963, + "learning_rate": 5.5615004678984e-07, + "loss": 0.3001, + "step": 17200 + }, + { + "epoch": 2.0722456351595424, + "grad_norm": 4.5537004470825195, + "learning_rate": 5.556279542116242e-07, + "loss": 0.305, + "step": 17210 + }, + { + "epoch": 2.073449729078868, + "grad_norm": 4.557441711425781, + "learning_rate": 5.551058002135913e-07, + "loss": 0.2978, + "step": 17220 + }, + { + "epoch": 2.0746538229981937, + "grad_norm": 3.7024407386779785, + "learning_rate": 5.545835853722608e-07, + "loss": 0.3134, + "step": 17230 + }, + { + "epoch": 2.0758579169175198, + "grad_norm": 5.503789901733398, + "learning_rate": 5.540613102642195e-07, + "loss": 0.3217, + "step": 17240 + }, + { + "epoch": 2.0770620108368454, + "grad_norm": 4.864404678344727, + "learning_rate": 5.535389754661208e-07, + "loss": 0.2983, + "step": 17250 + }, + { + "epoch": 2.078266104756171, + "grad_norm": 5.232902526855469, + "learning_rate": 5.530165815546835e-07, + "loss": 0.3154, + "step": 17260 + }, + { + "epoch": 2.0794701986754967, + "grad_norm": 4.34998083114624, + "learning_rate": 5.524941291066923e-07, + "loss": 0.3078, + "step": 17270 + }, + { + "epoch": 2.0806742925948223, + "grad_norm": 4.243396282196045, + "learning_rate": 5.519716186989962e-07, + "loss": 0.2971, + "step": 17280 + }, + { + "epoch": 2.081878386514148, + "grad_norm": 4.376738548278809, + "learning_rate": 5.514490509085083e-07, + "loss": 0.3081, + "step": 17290 + }, + { + "epoch": 2.083082480433474, + "grad_norm": 4.597198486328125, + "learning_rate": 5.50926426312205e-07, + "loss": 0.3279, + "step": 17300 + }, + { + "epoch": 2.0842865743527996, + "grad_norm": 4.825913906097412, + "learning_rate": 5.504037454871258e-07, + "loss": 0.3164, + "step": 17310 + }, + { + "epoch": 2.0854906682721253, + "grad_norm": 4.312431812286377, + "learning_rate": 5.498810090103711e-07, + "loss": 0.29, + "step": 17320 + }, + { + "epoch": 2.086694762191451, + "grad_norm": 4.7181854248046875, + "learning_rate": 5.493582174591045e-07, + "loss": 0.2962, + "step": 17330 + }, + { + "epoch": 2.0878988561107765, + "grad_norm": 5.4123759269714355, + "learning_rate": 5.488353714105488e-07, + "loss": 0.3044, + "step": 17340 + }, + { + "epoch": 2.089102950030102, + 
"grad_norm": 4.742303371429443, + "learning_rate": 5.48312471441988e-07, + "loss": 0.287, + "step": 17350 + }, + { + "epoch": 2.0903070439494282, + "grad_norm": 3.8717334270477295, + "learning_rate": 5.477895181307651e-07, + "loss": 0.3205, + "step": 17360 + }, + { + "epoch": 2.091511137868754, + "grad_norm": 4.724112510681152, + "learning_rate": 5.472665120542824e-07, + "loss": 0.2851, + "step": 17370 + }, + { + "epoch": 2.0927152317880795, + "grad_norm": 5.797724723815918, + "learning_rate": 5.4674345379e-07, + "loss": 0.3136, + "step": 17380 + }, + { + "epoch": 2.093919325707405, + "grad_norm": 4.77787446975708, + "learning_rate": 5.462203439154361e-07, + "loss": 0.3059, + "step": 17390 + }, + { + "epoch": 2.0951234196267308, + "grad_norm": 4.670202732086182, + "learning_rate": 5.456971830081655e-07, + "loss": 0.3219, + "step": 17400 + }, + { + "epoch": 2.0963275135460564, + "grad_norm": 4.7208099365234375, + "learning_rate": 5.451739716458195e-07, + "loss": 0.3146, + "step": 17410 + }, + { + "epoch": 2.0975316074653825, + "grad_norm": 4.647831439971924, + "learning_rate": 5.446507104060851e-07, + "loss": 0.3266, + "step": 17420 + }, + { + "epoch": 2.098735701384708, + "grad_norm": 4.2992987632751465, + "learning_rate": 5.441273998667046e-07, + "loss": 0.3091, + "step": 17430 + }, + { + "epoch": 2.0999397953040337, + "grad_norm": 4.718204975128174, + "learning_rate": 5.436040406054742e-07, + "loss": 0.3103, + "step": 17440 + }, + { + "epoch": 2.1011438892233594, + "grad_norm": 4.716932773590088, + "learning_rate": 5.430806332002443e-07, + "loss": 0.3044, + "step": 17450 + }, + { + "epoch": 2.102347983142685, + "grad_norm": 4.856298923492432, + "learning_rate": 5.425571782289185e-07, + "loss": 0.3039, + "step": 17460 + }, + { + "epoch": 2.1035520770620106, + "grad_norm": 5.1161208152771, + "learning_rate": 5.420336762694524e-07, + "loss": 0.3014, + "step": 17470 + }, + { + "epoch": 2.1047561709813367, + "grad_norm": 4.895595550537109, + "learning_rate": 5.415101278998543e-07, + "loss": 0.3113, + "step": 17480 + }, + { + "epoch": 2.1059602649006623, + "grad_norm": 4.259979248046875, + "learning_rate": 5.409865336981832e-07, + "loss": 0.3158, + "step": 17490 + }, + { + "epoch": 2.107164358819988, + "grad_norm": 5.523928642272949, + "learning_rate": 5.404628942425484e-07, + "loss": 0.3293, + "step": 17500 + }, + { + "epoch": 2.1083684527393136, + "grad_norm": 5.490001201629639, + "learning_rate": 5.399392101111102e-07, + "loss": 0.3253, + "step": 17510 + }, + { + "epoch": 2.1095725466586392, + "grad_norm": 4.070251941680908, + "learning_rate": 5.39415481882077e-07, + "loss": 0.3341, + "step": 17520 + }, + { + "epoch": 2.110776640577965, + "grad_norm": 4.516000270843506, + "learning_rate": 5.388917101337069e-07, + "loss": 0.3115, + "step": 17530 + }, + { + "epoch": 2.111980734497291, + "grad_norm": 4.881539821624756, + "learning_rate": 5.383678954443056e-07, + "loss": 0.2962, + "step": 17540 + }, + { + "epoch": 2.1131848284166166, + "grad_norm": 4.361866474151611, + "learning_rate": 5.378440383922261e-07, + "loss": 0.2959, + "step": 17550 + }, + { + "epoch": 2.1143889223359422, + "grad_norm": 4.218469619750977, + "learning_rate": 5.373201395558683e-07, + "loss": 0.3004, + "step": 17560 + }, + { + "epoch": 2.115593016255268, + "grad_norm": 5.058506488800049, + "learning_rate": 5.367961995136782e-07, + "loss": 0.3177, + "step": 17570 + }, + { + "epoch": 2.1167971101745935, + "grad_norm": 5.340724468231201, + "learning_rate": 5.362722188441476e-07, + "loss": 0.3116, + "step": 17580 + }, + { + 
"epoch": 2.118001204093919, + "grad_norm": 4.867612361907959, + "learning_rate": 5.357481981258128e-07, + "loss": 0.3287, + "step": 17590 + }, + { + "epoch": 2.119205298013245, + "grad_norm": 4.499852180480957, + "learning_rate": 5.352241379372545e-07, + "loss": 0.3057, + "step": 17600 + }, + { + "epoch": 2.120409391932571, + "grad_norm": 5.446403980255127, + "learning_rate": 5.347000388570966e-07, + "loss": 0.3206, + "step": 17610 + }, + { + "epoch": 2.1216134858518965, + "grad_norm": 4.157654762268066, + "learning_rate": 5.341759014640067e-07, + "loss": 0.2985, + "step": 17620 + }, + { + "epoch": 2.122817579771222, + "grad_norm": 5.162617206573486, + "learning_rate": 5.336517263366939e-07, + "loss": 0.3057, + "step": 17630 + }, + { + "epoch": 2.1240216736905477, + "grad_norm": 4.874579906463623, + "learning_rate": 5.331275140539094e-07, + "loss": 0.3096, + "step": 17640 + }, + { + "epoch": 2.125225767609874, + "grad_norm": 4.7379350662231445, + "learning_rate": 5.326032651944453e-07, + "loss": 0.3178, + "step": 17650 + }, + { + "epoch": 2.1264298615291994, + "grad_norm": 4.660308361053467, + "learning_rate": 5.320789803371344e-07, + "loss": 0.3121, + "step": 17660 + }, + { + "epoch": 2.127633955448525, + "grad_norm": 4.264311790466309, + "learning_rate": 5.315546600608486e-07, + "loss": 0.3041, + "step": 17670 + }, + { + "epoch": 2.1288380493678507, + "grad_norm": 5.007218360900879, + "learning_rate": 5.310303049444995e-07, + "loss": 0.3133, + "step": 17680 + }, + { + "epoch": 2.1300421432871763, + "grad_norm": 4.878419399261475, + "learning_rate": 5.305059155670369e-07, + "loss": 0.307, + "step": 17690 + }, + { + "epoch": 2.131246237206502, + "grad_norm": 4.373286724090576, + "learning_rate": 5.299814925074485e-07, + "loss": 0.2988, + "step": 17700 + }, + { + "epoch": 2.1324503311258276, + "grad_norm": 4.705572128295898, + "learning_rate": 5.294570363447589e-07, + "loss": 0.3101, + "step": 17710 + }, + { + "epoch": 2.1336544250451537, + "grad_norm": 5.6706461906433105, + "learning_rate": 5.2893254765803e-07, + "loss": 0.3182, + "step": 17720 + }, + { + "epoch": 2.1348585189644793, + "grad_norm": 4.4038896560668945, + "learning_rate": 5.284080270263586e-07, + "loss": 0.3055, + "step": 17730 + }, + { + "epoch": 2.136062612883805, + "grad_norm": 4.746342658996582, + "learning_rate": 5.278834750288776e-07, + "loss": 0.3098, + "step": 17740 + }, + { + "epoch": 2.1372667068031306, + "grad_norm": 4.472485065460205, + "learning_rate": 5.273588922447543e-07, + "loss": 0.3192, + "step": 17750 + }, + { + "epoch": 2.138470800722456, + "grad_norm": 5.553606033325195, + "learning_rate": 5.268342792531897e-07, + "loss": 0.3328, + "step": 17760 + }, + { + "epoch": 2.1396748946417823, + "grad_norm": 5.298537731170654, + "learning_rate": 5.263096366334183e-07, + "loss": 0.3072, + "step": 17770 + }, + { + "epoch": 2.140878988561108, + "grad_norm": 4.98936128616333, + "learning_rate": 5.257849649647077e-07, + "loss": 0.3131, + "step": 17780 + }, + { + "epoch": 2.1420830824804336, + "grad_norm": 4.389891147613525, + "learning_rate": 5.252602648263569e-07, + "loss": 0.3142, + "step": 17790 + }, + { + "epoch": 2.143287176399759, + "grad_norm": 4.614076614379883, + "learning_rate": 5.24735536797697e-07, + "loss": 0.3075, + "step": 17800 + }, + { + "epoch": 2.144491270319085, + "grad_norm": 5.098964214324951, + "learning_rate": 5.242107814580893e-07, + "loss": 0.3125, + "step": 17810 + }, + { + "epoch": 2.1456953642384105, + "grad_norm": 4.502909183502197, + "learning_rate": 5.236859993869258e-07, + "loss": 
0.2986, + "step": 17820 + }, + { + "epoch": 2.146899458157736, + "grad_norm": 5.02591609954834, + "learning_rate": 5.231611911636276e-07, + "loss": 0.294, + "step": 17830 + }, + { + "epoch": 2.148103552077062, + "grad_norm": 4.412136077880859, + "learning_rate": 5.226363573676447e-07, + "loss": 0.3085, + "step": 17840 + }, + { + "epoch": 2.149307645996388, + "grad_norm": 4.393168926239014, + "learning_rate": 5.221114985784558e-07, + "loss": 0.3145, + "step": 17850 + }, + { + "epoch": 2.1505117399157134, + "grad_norm": 4.741860389709473, + "learning_rate": 5.215866153755666e-07, + "loss": 0.3194, + "step": 17860 + }, + { + "epoch": 2.151715833835039, + "grad_norm": 4.4850006103515625, + "learning_rate": 5.210617083385101e-07, + "loss": 0.3015, + "step": 17870 + }, + { + "epoch": 2.1529199277543647, + "grad_norm": 5.466598033905029, + "learning_rate": 5.205367780468455e-07, + "loss": 0.311, + "step": 17880 + }, + { + "epoch": 2.1541240216736908, + "grad_norm": 5.164214611053467, + "learning_rate": 5.200118250801578e-07, + "loss": 0.3161, + "step": 17890 + }, + { + "epoch": 2.1553281155930164, + "grad_norm": 4.714061737060547, + "learning_rate": 5.194868500180567e-07, + "loss": 0.3171, + "step": 17900 + }, + { + "epoch": 2.156532209512342, + "grad_norm": 4.755367279052734, + "learning_rate": 5.189618534401768e-07, + "loss": 0.3059, + "step": 17910 + }, + { + "epoch": 2.1577363034316677, + "grad_norm": 4.605241298675537, + "learning_rate": 5.184368359261761e-07, + "loss": 0.3207, + "step": 17920 + }, + { + "epoch": 2.1589403973509933, + "grad_norm": 5.180820465087891, + "learning_rate": 5.179117980557357e-07, + "loss": 0.3097, + "step": 17930 + }, + { + "epoch": 2.160144491270319, + "grad_norm": 5.053746700286865, + "learning_rate": 5.173867404085594e-07, + "loss": 0.3208, + "step": 17940 + }, + { + "epoch": 2.1613485851896446, + "grad_norm": 4.809300899505615, + "learning_rate": 5.168616635643728e-07, + "loss": 0.3009, + "step": 17950 + }, + { + "epoch": 2.1625526791089706, + "grad_norm": 4.434291839599609, + "learning_rate": 5.163365681029224e-07, + "loss": 0.3118, + "step": 17960 + }, + { + "epoch": 2.1637567730282963, + "grad_norm": 3.94570255279541, + "learning_rate": 5.158114546039756e-07, + "loss": 0.3081, + "step": 17970 + }, + { + "epoch": 2.164960866947622, + "grad_norm": 4.972118854522705, + "learning_rate": 5.152863236473195e-07, + "loss": 0.3, + "step": 17980 + }, + { + "epoch": 2.1661649608669475, + "grad_norm": 5.422942161560059, + "learning_rate": 5.147611758127608e-07, + "loss": 0.3039, + "step": 17990 + }, + { + "epoch": 2.167369054786273, + "grad_norm": 4.45037317276001, + "learning_rate": 5.142360116801242e-07, + "loss": 0.3158, + "step": 18000 + }, + { + "epoch": 2.1685731487055993, + "grad_norm": 5.098633289337158, + "learning_rate": 5.137108318292533e-07, + "loss": 0.2949, + "step": 18010 + }, + { + "epoch": 2.169777242624925, + "grad_norm": 5.256601810455322, + "learning_rate": 5.131856368400082e-07, + "loss": 0.3037, + "step": 18020 + }, + { + "epoch": 2.1709813365442505, + "grad_norm": 5.189584732055664, + "learning_rate": 5.126604272922659e-07, + "loss": 0.3256, + "step": 18030 + }, + { + "epoch": 2.172185430463576, + "grad_norm": 4.259381294250488, + "learning_rate": 5.121352037659201e-07, + "loss": 0.3051, + "step": 18040 + }, + { + "epoch": 2.173389524382902, + "grad_norm": 4.795348644256592, + "learning_rate": 5.116099668408791e-07, + "loss": 0.3002, + "step": 18050 + }, + { + "epoch": 2.1745936183022274, + "grad_norm": 5.63735818862915, + "learning_rate": 
5.110847170970665e-07, + "loss": 0.313, + "step": 18060 + }, + { + "epoch": 2.175797712221553, + "grad_norm": 6.581758975982666, + "learning_rate": 5.1055945511442e-07, + "loss": 0.3014, + "step": 18070 + }, + { + "epoch": 2.177001806140879, + "grad_norm": 5.026032447814941, + "learning_rate": 5.100341814728904e-07, + "loss": 0.3009, + "step": 18080 + }, + { + "epoch": 2.1782059000602048, + "grad_norm": 4.6837263107299805, + "learning_rate": 5.095088967524423e-07, + "loss": 0.3251, + "step": 18090 + }, + { + "epoch": 2.1794099939795304, + "grad_norm": 4.637839317321777, + "learning_rate": 5.089836015330513e-07, + "loss": 0.3177, + "step": 18100 + }, + { + "epoch": 2.180614087898856, + "grad_norm": 4.267435550689697, + "learning_rate": 5.084582963947057e-07, + "loss": 0.3003, + "step": 18110 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 4.481462001800537, + "learning_rate": 5.07932981917404e-07, + "loss": 0.3084, + "step": 18120 + }, + { + "epoch": 2.1830222757375077, + "grad_norm": 5.001600742340088, + "learning_rate": 5.074076586811554e-07, + "loss": 0.3117, + "step": 18130 + }, + { + "epoch": 2.1842263696568334, + "grad_norm": 4.785762310028076, + "learning_rate": 5.068823272659785e-07, + "loss": 0.3044, + "step": 18140 + }, + { + "epoch": 2.185430463576159, + "grad_norm": 4.241122245788574, + "learning_rate": 5.063569882519014e-07, + "loss": 0.3114, + "step": 18150 + }, + { + "epoch": 2.1866345574954846, + "grad_norm": 4.614393711090088, + "learning_rate": 5.0583164221896e-07, + "loss": 0.3143, + "step": 18160 + }, + { + "epoch": 2.1878386514148103, + "grad_norm": 5.790137767791748, + "learning_rate": 5.053062897471985e-07, + "loss": 0.3086, + "step": 18170 + }, + { + "epoch": 2.189042745334136, + "grad_norm": 5.027008056640625, + "learning_rate": 5.047809314166677e-07, + "loss": 0.2996, + "step": 18180 + }, + { + "epoch": 2.190246839253462, + "grad_norm": 4.725672245025635, + "learning_rate": 5.042555678074251e-07, + "loss": 0.3101, + "step": 18190 + }, + { + "epoch": 2.1914509331727876, + "grad_norm": 4.756001949310303, + "learning_rate": 5.037301994995342e-07, + "loss": 0.2892, + "step": 18200 + }, + { + "epoch": 2.1926550270921132, + "grad_norm": 3.9560751914978027, + "learning_rate": 5.032048270730634e-07, + "loss": 0.3118, + "step": 18210 + }, + { + "epoch": 2.193859121011439, + "grad_norm": 4.681294918060303, + "learning_rate": 5.026794511080859e-07, + "loss": 0.306, + "step": 18220 + }, + { + "epoch": 2.1950632149307645, + "grad_norm": 5.220909118652344, + "learning_rate": 5.021540721846787e-07, + "loss": 0.3089, + "step": 18230 + }, + { + "epoch": 2.19626730885009, + "grad_norm": 4.095883369445801, + "learning_rate": 5.016286908829218e-07, + "loss": 0.3179, + "step": 18240 + }, + { + "epoch": 2.197471402769416, + "grad_norm": 4.485768795013428, + "learning_rate": 5.011033077828982e-07, + "loss": 0.3037, + "step": 18250 + }, + { + "epoch": 2.198675496688742, + "grad_norm": 4.850970268249512, + "learning_rate": 5.00577923464693e-07, + "loss": 0.3098, + "step": 18260 + }, + { + "epoch": 2.1998795906080675, + "grad_norm": 4.3276848793029785, + "learning_rate": 5.000525385083919e-07, + "loss": 0.3117, + "step": 18270 + }, + { + "epoch": 2.201083684527393, + "grad_norm": 4.39775276184082, + "learning_rate": 4.995271534940823e-07, + "loss": 0.3185, + "step": 18280 + }, + { + "epoch": 2.2022877784467187, + "grad_norm": 4.972282409667969, + "learning_rate": 4.99001769001851e-07, + "loss": 0.3131, + "step": 18290 + }, + { + "epoch": 2.2034918723660444, + "grad_norm": 
4.450355052947998, + "learning_rate": 4.984763856117842e-07, + "loss": 0.3052, + "step": 18300 + }, + { + "epoch": 2.2046959662853705, + "grad_norm": 4.771944046020508, + "learning_rate": 4.979510039039674e-07, + "loss": 0.3087, + "step": 18310 + }, + { + "epoch": 2.205900060204696, + "grad_norm": 4.077056407928467, + "learning_rate": 4.974256244584838e-07, + "loss": 0.2991, + "step": 18320 + }, + { + "epoch": 2.2071041541240217, + "grad_norm": 4.485861778259277, + "learning_rate": 4.969002478554139e-07, + "loss": 0.3117, + "step": 18330 + }, + { + "epoch": 2.2083082480433474, + "grad_norm": 4.26900053024292, + "learning_rate": 4.963748746748358e-07, + "loss": 0.299, + "step": 18340 + }, + { + "epoch": 2.209512341962673, + "grad_norm": 5.258630752563477, + "learning_rate": 4.958495054968235e-07, + "loss": 0.3109, + "step": 18350 + }, + { + "epoch": 2.2107164358819986, + "grad_norm": 5.4050774574279785, + "learning_rate": 4.953241409014459e-07, + "loss": 0.3263, + "step": 18360 + }, + { + "epoch": 2.2119205298013247, + "grad_norm": 4.431223392486572, + "learning_rate": 4.947987814687679e-07, + "loss": 0.3131, + "step": 18370 + }, + { + "epoch": 2.2131246237206503, + "grad_norm": 5.015274524688721, + "learning_rate": 4.942734277788481e-07, + "loss": 0.3122, + "step": 18380 + }, + { + "epoch": 2.214328717639976, + "grad_norm": 5.460362911224365, + "learning_rate": 4.937480804117392e-07, + "loss": 0.3049, + "step": 18390 + }, + { + "epoch": 2.2155328115593016, + "grad_norm": 4.469453811645508, + "learning_rate": 4.93222739947486e-07, + "loss": 0.3109, + "step": 18400 + }, + { + "epoch": 2.2167369054786272, + "grad_norm": 4.560921669006348, + "learning_rate": 4.926974069661265e-07, + "loss": 0.3155, + "step": 18410 + }, + { + "epoch": 2.217940999397953, + "grad_norm": 4.696376800537109, + "learning_rate": 4.921720820476904e-07, + "loss": 0.3256, + "step": 18420 + }, + { + "epoch": 2.219145093317279, + "grad_norm": 4.80272102355957, + "learning_rate": 4.916467657721984e-07, + "loss": 0.3172, + "step": 18430 + }, + { + "epoch": 2.2203491872366046, + "grad_norm": 4.686549663543701, + "learning_rate": 4.911214587196612e-07, + "loss": 0.3044, + "step": 18440 + }, + { + "epoch": 2.22155328115593, + "grad_norm": 4.5141921043396, + "learning_rate": 4.9059616147008e-07, + "loss": 0.296, + "step": 18450 + }, + { + "epoch": 2.222757375075256, + "grad_norm": 4.311396598815918, + "learning_rate": 4.900708746034446e-07, + "loss": 0.3052, + "step": 18460 + }, + { + "epoch": 2.2239614689945815, + "grad_norm": 4.644687175750732, + "learning_rate": 4.895455986997341e-07, + "loss": 0.3091, + "step": 18470 + }, + { + "epoch": 2.225165562913907, + "grad_norm": 4.708485126495361, + "learning_rate": 4.890203343389144e-07, + "loss": 0.3126, + "step": 18480 + }, + { + "epoch": 2.226369656833233, + "grad_norm": 4.648069381713867, + "learning_rate": 4.884950821009394e-07, + "loss": 0.3303, + "step": 18490 + }, + { + "epoch": 2.227573750752559, + "grad_norm": 5.3636555671691895, + "learning_rate": 4.8796984256575e-07, + "loss": 0.308, + "step": 18500 + }, + { + "epoch": 2.2287778446718844, + "grad_norm": 4.061014652252197, + "learning_rate": 4.874446163132719e-07, + "loss": 0.2957, + "step": 18510 + }, + { + "epoch": 2.22998193859121, + "grad_norm": 6.169346332550049, + "learning_rate": 4.869194039234169e-07, + "loss": 0.318, + "step": 18520 + }, + { + "epoch": 2.2311860325105357, + "grad_norm": 4.9474053382873535, + "learning_rate": 4.863942059760817e-07, + "loss": 0.3112, + "step": 18530 + }, + { + "epoch": 
2.2323901264298613, + "grad_norm": 4.635356903076172, + "learning_rate": 4.858690230511465e-07, + "loss": 0.3006, + "step": 18540 + }, + { + "epoch": 2.2335942203491874, + "grad_norm": 4.872357368469238, + "learning_rate": 4.85343855728475e-07, + "loss": 0.315, + "step": 18550 + }, + { + "epoch": 2.234798314268513, + "grad_norm": 4.909818172454834, + "learning_rate": 4.848187045879141e-07, + "loss": 0.2983, + "step": 18560 + }, + { + "epoch": 2.2360024081878387, + "grad_norm": 5.507841110229492, + "learning_rate": 4.842935702092923e-07, + "loss": 0.2919, + "step": 18570 + }, + { + "epoch": 2.2372065021071643, + "grad_norm": 4.438649654388428, + "learning_rate": 4.837684531724202e-07, + "loss": 0.3012, + "step": 18580 + }, + { + "epoch": 2.23841059602649, + "grad_norm": 4.70427942276001, + "learning_rate": 4.832433540570885e-07, + "loss": 0.3076, + "step": 18590 + }, + { + "epoch": 2.2396146899458156, + "grad_norm": 4.81848669052124, + "learning_rate": 4.827182734430687e-07, + "loss": 0.3021, + "step": 18600 + }, + { + "epoch": 2.2408187838651417, + "grad_norm": 4.911860466003418, + "learning_rate": 4.821932119101116e-07, + "loss": 0.3109, + "step": 18610 + }, + { + "epoch": 2.2420228777844673, + "grad_norm": 5.092623233795166, + "learning_rate": 4.816681700379472e-07, + "loss": 0.3243, + "step": 18620 + }, + { + "epoch": 2.243226971703793, + "grad_norm": 4.224728584289551, + "learning_rate": 4.811431484062832e-07, + "loss": 0.3128, + "step": 18630 + }, + { + "epoch": 2.2444310656231186, + "grad_norm": 4.93331241607666, + "learning_rate": 4.806181475948057e-07, + "loss": 0.3147, + "step": 18640 + }, + { + "epoch": 2.245635159542444, + "grad_norm": 6.220354080200195, + "learning_rate": 4.800931681831773e-07, + "loss": 0.2964, + "step": 18650 + }, + { + "epoch": 2.24683925346177, + "grad_norm": 5.004923343658447, + "learning_rate": 4.795682107510375e-07, + "loss": 0.3172, + "step": 18660 + }, + { + "epoch": 2.248043347381096, + "grad_norm": 5.164400577545166, + "learning_rate": 4.790432758780005e-07, + "loss": 0.3063, + "step": 18670 + }, + { + "epoch": 2.2492474413004215, + "grad_norm": 5.098756313323975, + "learning_rate": 4.785183641436569e-07, + "loss": 0.3045, + "step": 18680 + }, + { + "epoch": 2.250451535219747, + "grad_norm": 4.363048553466797, + "learning_rate": 4.779934761275706e-07, + "loss": 0.3084, + "step": 18690 + }, + { + "epoch": 2.251655629139073, + "grad_norm": 5.233163833618164, + "learning_rate": 4.774686124092804e-07, + "loss": 0.316, + "step": 18700 + }, + { + "epoch": 2.2528597230583984, + "grad_norm": 4.870039463043213, + "learning_rate": 4.769437735682972e-07, + "loss": 0.3008, + "step": 18710 + }, + { + "epoch": 2.254063816977724, + "grad_norm": 5.44446325302124, + "learning_rate": 4.7641896018410506e-07, + "loss": 0.3139, + "step": 18720 + }, + { + "epoch": 2.25526791089705, + "grad_norm": 4.950879096984863, + "learning_rate": 4.758941728361599e-07, + "loss": 0.3108, + "step": 18730 + }, + { + "epoch": 2.2564720048163758, + "grad_norm": 4.887548446655273, + "learning_rate": 4.7536941210388895e-07, + "loss": 0.3195, + "step": 18740 + }, + { + "epoch": 2.2576760987357014, + "grad_norm": 6.180630207061768, + "learning_rate": 4.7484467856668946e-07, + "loss": 0.3112, + "step": 18750 + }, + { + "epoch": 2.258880192655027, + "grad_norm": 5.481302738189697, + "learning_rate": 4.743199728039294e-07, + "loss": 0.3124, + "step": 18760 + }, + { + "epoch": 2.2600842865743527, + "grad_norm": 4.6261677742004395, + "learning_rate": 4.737952953949457e-07, + "loss": 0.3058, + 
"step": 18770 + }, + { + "epoch": 2.2612883804936788, + "grad_norm": 4.097585201263428, + "learning_rate": 4.732706469190442e-07, + "loss": 0.3271, + "step": 18780 + }, + { + "epoch": 2.2624924744130044, + "grad_norm": 5.000282287597656, + "learning_rate": 4.7274602795549836e-07, + "loss": 0.317, + "step": 18790 + }, + { + "epoch": 2.26369656833233, + "grad_norm": 4.3350958824157715, + "learning_rate": 4.7222143908354943e-07, + "loss": 0.3083, + "step": 18800 + }, + { + "epoch": 2.2649006622516556, + "grad_norm": 4.336573123931885, + "learning_rate": 4.7169688088240555e-07, + "loss": 0.3139, + "step": 18810 + }, + { + "epoch": 2.2661047561709813, + "grad_norm": 4.1952900886535645, + "learning_rate": 4.7117235393124064e-07, + "loss": 0.294, + "step": 18820 + }, + { + "epoch": 2.267308850090307, + "grad_norm": 5.418072700500488, + "learning_rate": 4.7064785880919414e-07, + "loss": 0.3185, + "step": 18830 + }, + { + "epoch": 2.2685129440096325, + "grad_norm": 5.001430511474609, + "learning_rate": 4.701233960953708e-07, + "loss": 0.3108, + "step": 18840 + }, + { + "epoch": 2.2697170379289586, + "grad_norm": 5.28980827331543, + "learning_rate": 4.69598966368839e-07, + "loss": 0.3149, + "step": 18850 + }, + { + "epoch": 2.2709211318482843, + "grad_norm": 5.221833229064941, + "learning_rate": 4.6907457020863095e-07, + "loss": 0.3106, + "step": 18860 + }, + { + "epoch": 2.27212522576761, + "grad_norm": 4.259886264801025, + "learning_rate": 4.6855020819374196e-07, + "loss": 0.3159, + "step": 18870 + }, + { + "epoch": 2.2733293196869355, + "grad_norm": 5.210353851318359, + "learning_rate": 4.680258809031293e-07, + "loss": 0.306, + "step": 18880 + }, + { + "epoch": 2.274533413606261, + "grad_norm": 4.933556079864502, + "learning_rate": 4.6750158891571246e-07, + "loss": 0.2988, + "step": 18890 + }, + { + "epoch": 2.2757375075255872, + "grad_norm": 5.060166358947754, + "learning_rate": 4.669773328103712e-07, + "loss": 0.3298, + "step": 18900 + }, + { + "epoch": 2.276941601444913, + "grad_norm": 5.316260814666748, + "learning_rate": 4.664531131659461e-07, + "loss": 0.3193, + "step": 18910 + }, + { + "epoch": 2.2781456953642385, + "grad_norm": 4.371904373168945, + "learning_rate": 4.659289305612375e-07, + "loss": 0.3181, + "step": 18920 + }, + { + "epoch": 2.279349789283564, + "grad_norm": 4.114840984344482, + "learning_rate": 4.65404785575005e-07, + "loss": 0.3089, + "step": 18930 + }, + { + "epoch": 2.2805538832028898, + "grad_norm": 4.94135046005249, + "learning_rate": 4.64880678785966e-07, + "loss": 0.3158, + "step": 18940 + }, + { + "epoch": 2.2817579771222154, + "grad_norm": 5.033153057098389, + "learning_rate": 4.6435661077279633e-07, + "loss": 0.3087, + "step": 18950 + }, + { + "epoch": 2.282962071041541, + "grad_norm": 4.434708595275879, + "learning_rate": 4.638325821141289e-07, + "loss": 0.3031, + "step": 18960 + }, + { + "epoch": 2.284166164960867, + "grad_norm": 4.674195766448975, + "learning_rate": 4.6330859338855325e-07, + "loss": 0.3227, + "step": 18970 + }, + { + "epoch": 2.2853702588801927, + "grad_norm": 4.624505043029785, + "learning_rate": 4.6278464517461434e-07, + "loss": 0.2994, + "step": 18980 + }, + { + "epoch": 2.2865743527995184, + "grad_norm": 4.435290336608887, + "learning_rate": 4.622607380508129e-07, + "loss": 0.3125, + "step": 18990 + }, + { + "epoch": 2.287778446718844, + "grad_norm": 4.538943767547607, + "learning_rate": 4.6173687259560417e-07, + "loss": 0.3166, + "step": 19000 + }, + { + "epoch": 2.2889825406381696, + "grad_norm": 5.1769890785217285, + "learning_rate": 
4.6121304938739754e-07, + "loss": 0.2978, + "step": 19010 + }, + { + "epoch": 2.2901866345574957, + "grad_norm": 4.897463321685791, + "learning_rate": 4.606892690045551e-07, + "loss": 0.2857, + "step": 19020 + }, + { + "epoch": 2.2913907284768213, + "grad_norm": 5.332199573516846, + "learning_rate": 4.601655320253924e-07, + "loss": 0.3082, + "step": 19030 + }, + { + "epoch": 2.292594822396147, + "grad_norm": 4.842720985412598, + "learning_rate": 4.5964183902817677e-07, + "loss": 0.3003, + "step": 19040 + }, + { + "epoch": 2.2937989163154726, + "grad_norm": 4.277060031890869, + "learning_rate": 4.5911819059112724e-07, + "loss": 0.3027, + "step": 19050 + }, + { + "epoch": 2.2950030102347982, + "grad_norm": 4.499503135681152, + "learning_rate": 4.5859458729241287e-07, + "loss": 0.311, + "step": 19060 + }, + { + "epoch": 2.296207104154124, + "grad_norm": 5.2861762046813965, + "learning_rate": 4.580710297101537e-07, + "loss": 0.3197, + "step": 19070 + }, + { + "epoch": 2.2974111980734495, + "grad_norm": 4.3773112297058105, + "learning_rate": 4.5754751842241905e-07, + "loss": 0.3113, + "step": 19080 + }, + { + "epoch": 2.2986152919927756, + "grad_norm": 4.447787284851074, + "learning_rate": 4.5702405400722703e-07, + "loss": 0.3037, + "step": 19090 + }, + { + "epoch": 2.299819385912101, + "grad_norm": 5.014771938323975, + "learning_rate": 4.5650063704254395e-07, + "loss": 0.3018, + "step": 19100 + }, + { + "epoch": 2.301023479831427, + "grad_norm": 4.333285331726074, + "learning_rate": 4.55977268106284e-07, + "loss": 0.3176, + "step": 19110 + }, + { + "epoch": 2.3022275737507525, + "grad_norm": 6.291433334350586, + "learning_rate": 4.5545394777630786e-07, + "loss": 0.3335, + "step": 19120 + }, + { + "epoch": 2.303431667670078, + "grad_norm": 4.657562255859375, + "learning_rate": 4.5493067663042325e-07, + "loss": 0.3059, + "step": 19130 + }, + { + "epoch": 2.304635761589404, + "grad_norm": 4.472227573394775, + "learning_rate": 4.544074552463829e-07, + "loss": 0.3074, + "step": 19140 + }, + { + "epoch": 2.30583985550873, + "grad_norm": 5.011964797973633, + "learning_rate": 4.5388428420188486e-07, + "loss": 0.3036, + "step": 19150 + }, + { + "epoch": 2.3070439494280555, + "grad_norm": 5.620879173278809, + "learning_rate": 4.533611640745718e-07, + "loss": 0.31, + "step": 19160 + }, + { + "epoch": 2.308248043347381, + "grad_norm": 5.25240421295166, + "learning_rate": 4.5283809544202996e-07, + "loss": 0.328, + "step": 19170 + }, + { + "epoch": 2.3094521372667067, + "grad_norm": 4.3917317390441895, + "learning_rate": 4.5231507888178856e-07, + "loss": 0.3129, + "step": 19180 + }, + { + "epoch": 2.3106562311860324, + "grad_norm": 4.568994998931885, + "learning_rate": 4.517921149713196e-07, + "loss": 0.3057, + "step": 19190 + }, + { + "epoch": 2.311860325105358, + "grad_norm": 4.5026726722717285, + "learning_rate": 4.512692042880372e-07, + "loss": 0.2997, + "step": 19200 + }, + { + "epoch": 2.313064419024684, + "grad_norm": 3.986133098602295, + "learning_rate": 4.507463474092959e-07, + "loss": 0.2952, + "step": 19210 + }, + { + "epoch": 2.3142685129440097, + "grad_norm": 4.367317199707031, + "learning_rate": 4.5022354491239145e-07, + "loss": 0.3036, + "step": 19220 + }, + { + "epoch": 2.3154726068633353, + "grad_norm": 5.649072170257568, + "learning_rate": 4.497007973745595e-07, + "loss": 0.3173, + "step": 19230 + }, + { + "epoch": 2.316676700782661, + "grad_norm": 5.655643463134766, + "learning_rate": 4.4917810537297514e-07, + "loss": 0.327, + "step": 19240 + }, + { + "epoch": 2.3178807947019866, + 
"grad_norm": 5.137732982635498, + "learning_rate": 4.4865546948475147e-07, + "loss": 0.3065, + "step": 19250 + }, + { + "epoch": 2.3190848886213127, + "grad_norm": 4.715443134307861, + "learning_rate": 4.481328902869404e-07, + "loss": 0.3207, + "step": 19260 + }, + { + "epoch": 2.3202889825406383, + "grad_norm": 3.9082722663879395, + "learning_rate": 4.476103683565308e-07, + "loss": 0.3074, + "step": 19270 + }, + { + "epoch": 2.321493076459964, + "grad_norm": 4.448252201080322, + "learning_rate": 4.4708790427044887e-07, + "loss": 0.3063, + "step": 19280 + }, + { + "epoch": 2.3226971703792896, + "grad_norm": 4.547604560852051, + "learning_rate": 4.465654986055559e-07, + "loss": 0.3098, + "step": 19290 + }, + { + "epoch": 2.323901264298615, + "grad_norm": 5.669996738433838, + "learning_rate": 4.460431519386497e-07, + "loss": 0.3188, + "step": 19300 + }, + { + "epoch": 2.325105358217941, + "grad_norm": 5.271092891693115, + "learning_rate": 4.4552086484646246e-07, + "loss": 0.2948, + "step": 19310 + }, + { + "epoch": 2.3263094521372665, + "grad_norm": 5.6719231605529785, + "learning_rate": 4.4499863790566087e-07, + "loss": 0.3089, + "step": 19320 + }, + { + "epoch": 2.3275135460565926, + "grad_norm": 5.9080657958984375, + "learning_rate": 4.444764716928447e-07, + "loss": 0.3195, + "step": 19330 + }, + { + "epoch": 2.328717639975918, + "grad_norm": 5.201897144317627, + "learning_rate": 4.43954366784547e-07, + "loss": 0.2979, + "step": 19340 + }, + { + "epoch": 2.329921733895244, + "grad_norm": 4.319961071014404, + "learning_rate": 4.4343232375723343e-07, + "loss": 0.3059, + "step": 19350 + }, + { + "epoch": 2.3311258278145695, + "grad_norm": 4.492523670196533, + "learning_rate": 4.4291034318730086e-07, + "loss": 0.2941, + "step": 19360 + }, + { + "epoch": 2.332329921733895, + "grad_norm": 5.589833736419678, + "learning_rate": 4.4238842565107715e-07, + "loss": 0.3089, + "step": 19370 + }, + { + "epoch": 2.333534015653221, + "grad_norm": 4.234698295593262, + "learning_rate": 4.4186657172482105e-07, + "loss": 0.3012, + "step": 19380 + }, + { + "epoch": 2.334738109572547, + "grad_norm": 4.777867317199707, + "learning_rate": 4.413447819847206e-07, + "loss": 0.3083, + "step": 19390 + }, + { + "epoch": 2.3359422034918724, + "grad_norm": 5.0551533699035645, + "learning_rate": 4.4082305700689334e-07, + "loss": 0.3056, + "step": 19400 + }, + { + "epoch": 2.337146297411198, + "grad_norm": 4.407803535461426, + "learning_rate": 4.40301397367385e-07, + "loss": 0.3137, + "step": 19410 + }, + { + "epoch": 2.3383503913305237, + "grad_norm": 4.408458709716797, + "learning_rate": 4.3977980364216925e-07, + "loss": 0.3234, + "step": 19420 + }, + { + "epoch": 2.3395544852498493, + "grad_norm": 5.100025653839111, + "learning_rate": 4.392582764071471e-07, + "loss": 0.3053, + "step": 19430 + }, + { + "epoch": 2.340758579169175, + "grad_norm": 4.870809078216553, + "learning_rate": 4.3873681623814634e-07, + "loss": 0.2973, + "step": 19440 + }, + { + "epoch": 2.341962673088501, + "grad_norm": 5.078246116638184, + "learning_rate": 4.3821542371092e-07, + "loss": 0.3042, + "step": 19450 + }, + { + "epoch": 2.3431667670078267, + "grad_norm": 4.400288105010986, + "learning_rate": 4.3769409940114706e-07, + "loss": 0.3012, + "step": 19460 + }, + { + "epoch": 2.3443708609271523, + "grad_norm": 5.289750576019287, + "learning_rate": 4.3717284388443123e-07, + "loss": 0.3149, + "step": 19470 + }, + { + "epoch": 2.345574954846478, + "grad_norm": 4.133148670196533, + "learning_rate": 4.3665165773629955e-07, + "loss": 0.311, + "step": 
19480 + }, + { + "epoch": 2.3467790487658036, + "grad_norm": 4.689704418182373, + "learning_rate": 4.361305415322032e-07, + "loss": 0.2985, + "step": 19490 + }, + { + "epoch": 2.3479831426851296, + "grad_norm": 5.3425822257995605, + "learning_rate": 4.35609495847516e-07, + "loss": 0.3252, + "step": 19500 + }, + { + "epoch": 2.3491872366044553, + "grad_norm": 4.8020524978637695, + "learning_rate": 4.350885212575338e-07, + "loss": 0.3017, + "step": 19510 + }, + { + "epoch": 2.350391330523781, + "grad_norm": 3.823481798171997, + "learning_rate": 4.345676183374737e-07, + "loss": 0.3163, + "step": 19520 + }, + { + "epoch": 2.3515954244431065, + "grad_norm": 5.067866802215576, + "learning_rate": 4.3404678766247393e-07, + "loss": 0.2985, + "step": 19530 + }, + { + "epoch": 2.352799518362432, + "grad_norm": 4.470125198364258, + "learning_rate": 4.335260298075931e-07, + "loss": 0.3215, + "step": 19540 + }, + { + "epoch": 2.354003612281758, + "grad_norm": 4.854072093963623, + "learning_rate": 4.330053453478094e-07, + "loss": 0.3139, + "step": 19550 + }, + { + "epoch": 2.3552077062010834, + "grad_norm": 4.061732292175293, + "learning_rate": 4.3248473485801943e-07, + "loss": 0.2944, + "step": 19560 + }, + { + "epoch": 2.3564118001204095, + "grad_norm": 4.881399154663086, + "learning_rate": 4.319641989130387e-07, + "loss": 0.2958, + "step": 19570 + }, + { + "epoch": 2.357615894039735, + "grad_norm": 4.650146007537842, + "learning_rate": 4.3144373808760026e-07, + "loss": 0.3092, + "step": 19580 + }, + { + "epoch": 2.358819987959061, + "grad_norm": 5.014580249786377, + "learning_rate": 4.3092335295635444e-07, + "loss": 0.3143, + "step": 19590 + }, + { + "epoch": 2.3600240818783864, + "grad_norm": 5.064713478088379, + "learning_rate": 4.304030440938673e-07, + "loss": 0.3106, + "step": 19600 + }, + { + "epoch": 2.361228175797712, + "grad_norm": 4.044290065765381, + "learning_rate": 4.298828120746213e-07, + "loss": 0.3024, + "step": 19610 + }, + { + "epoch": 2.362432269717038, + "grad_norm": 5.447383403778076, + "learning_rate": 4.29362657473014e-07, + "loss": 0.3147, + "step": 19620 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 4.447105884552002, + "learning_rate": 4.2884258086335745e-07, + "loss": 0.303, + "step": 19630 + }, + { + "epoch": 2.3648404575556894, + "grad_norm": 4.2513957023620605, + "learning_rate": 4.2832258281987724e-07, + "loss": 0.3107, + "step": 19640 + }, + { + "epoch": 2.366044551475015, + "grad_norm": 5.619822025299072, + "learning_rate": 4.2780266391671277e-07, + "loss": 0.3212, + "step": 19650 + }, + { + "epoch": 2.3672486453943407, + "grad_norm": 5.056023597717285, + "learning_rate": 4.272828247279155e-07, + "loss": 0.298, + "step": 19660 + }, + { + "epoch": 2.3684527393136663, + "grad_norm": 4.584505558013916, + "learning_rate": 4.267630658274495e-07, + "loss": 0.3069, + "step": 19670 + }, + { + "epoch": 2.3696568332329924, + "grad_norm": 5.227287292480469, + "learning_rate": 4.2624338778918964e-07, + "loss": 0.296, + "step": 19680 + }, + { + "epoch": 2.370860927152318, + "grad_norm": 4.425261974334717, + "learning_rate": 4.2572379118692155e-07, + "loss": 0.3093, + "step": 19690 + }, + { + "epoch": 2.3720650210716436, + "grad_norm": 4.10771369934082, + "learning_rate": 4.2520427659434134e-07, + "loss": 0.295, + "step": 19700 + }, + { + "epoch": 2.3732691149909693, + "grad_norm": 4.561648845672607, + "learning_rate": 4.2468484458505456e-07, + "loss": 0.3006, + "step": 19710 + }, + { + "epoch": 2.374473208910295, + "grad_norm": 3.9050345420837402, + "learning_rate": 
4.241654957325748e-07, + "loss": 0.3016, + "step": 19720 + }, + { + "epoch": 2.3756773028296205, + "grad_norm": 5.106329917907715, + "learning_rate": 4.2364623061032477e-07, + "loss": 0.3043, + "step": 19730 + }, + { + "epoch": 2.3768813967489466, + "grad_norm": 6.0447211265563965, + "learning_rate": 4.231270497916343e-07, + "loss": 0.3114, + "step": 19740 + }, + { + "epoch": 2.3780854906682722, + "grad_norm": 4.171956539154053, + "learning_rate": 4.2260795384974037e-07, + "loss": 0.3033, + "step": 19750 + }, + { + "epoch": 2.379289584587598, + "grad_norm": 4.500546932220459, + "learning_rate": 4.2208894335778573e-07, + "loss": 0.3066, + "step": 19760 + }, + { + "epoch": 2.3804936785069235, + "grad_norm": 5.30014181137085, + "learning_rate": 4.215700188888192e-07, + "loss": 0.3008, + "step": 19770 + }, + { + "epoch": 2.381697772426249, + "grad_norm": 4.23181676864624, + "learning_rate": 4.2105118101579497e-07, + "loss": 0.2925, + "step": 19780 + }, + { + "epoch": 2.3829018663455748, + "grad_norm": 4.446700096130371, + "learning_rate": 4.205324303115706e-07, + "loss": 0.3142, + "step": 19790 + }, + { + "epoch": 2.384105960264901, + "grad_norm": 5.344078063964844, + "learning_rate": 4.2001376734890824e-07, + "loss": 0.3053, + "step": 19800 + }, + { + "epoch": 2.3853100541842265, + "grad_norm": 5.066955089569092, + "learning_rate": 4.1949519270047295e-07, + "loss": 0.3071, + "step": 19810 + }, + { + "epoch": 2.386514148103552, + "grad_norm": 4.834653377532959, + "learning_rate": 4.1897670693883255e-07, + "loss": 0.3039, + "step": 19820 + }, + { + "epoch": 2.3877182420228777, + "grad_norm": 4.982695579528809, + "learning_rate": 4.1845831063645586e-07, + "loss": 0.3007, + "step": 19830 + }, + { + "epoch": 2.3889223359422034, + "grad_norm": 5.261125564575195, + "learning_rate": 4.1794000436571374e-07, + "loss": 0.3121, + "step": 19840 + }, + { + "epoch": 2.390126429861529, + "grad_norm": 5.1389570236206055, + "learning_rate": 4.174217886988775e-07, + "loss": 0.3058, + "step": 19850 + }, + { + "epoch": 2.391330523780855, + "grad_norm": 4.307366371154785, + "learning_rate": 4.169036642081183e-07, + "loss": 0.3008, + "step": 19860 + }, + { + "epoch": 2.3925346177001807, + "grad_norm": 5.068446636199951, + "learning_rate": 4.163856314655064e-07, + "loss": 0.3145, + "step": 19870 + }, + { + "epoch": 2.3937387116195064, + "grad_norm": 5.377712249755859, + "learning_rate": 4.1586769104301124e-07, + "loss": 0.3047, + "step": 19880 + }, + { + "epoch": 2.394942805538832, + "grad_norm": 5.161853313446045, + "learning_rate": 4.153498435124999e-07, + "loss": 0.3111, + "step": 19890 + }, + { + "epoch": 2.3961468994581576, + "grad_norm": 4.217031002044678, + "learning_rate": 4.1483208944573745e-07, + "loss": 0.2886, + "step": 19900 + }, + { + "epoch": 2.3973509933774833, + "grad_norm": 4.948873996734619, + "learning_rate": 4.1431442941438486e-07, + "loss": 0.3138, + "step": 19910 + }, + { + "epoch": 2.3985550872968093, + "grad_norm": 5.304249286651611, + "learning_rate": 4.1379686399000016e-07, + "loss": 0.3013, + "step": 19920 + }, + { + "epoch": 2.399759181216135, + "grad_norm": 5.372039318084717, + "learning_rate": 4.132793937440365e-07, + "loss": 0.316, + "step": 19930 + }, + { + "epoch": 2.4009632751354606, + "grad_norm": 5.1526265144348145, + "learning_rate": 4.127620192478421e-07, + "loss": 0.3177, + "step": 19940 + }, + { + "epoch": 2.4021673690547862, + "grad_norm": 4.650707244873047, + "learning_rate": 4.122447410726591e-07, + "loss": 0.3014, + "step": 19950 + }, + { + "epoch": 2.403371462974112, + 
"grad_norm": 4.576737403869629, + "learning_rate": 4.1172755978962395e-07, + "loss": 0.3069, + "step": 19960 + }, + { + "epoch": 2.4045755568934375, + "grad_norm": 5.201079845428467, + "learning_rate": 4.1121047596976534e-07, + "loss": 0.3151, + "step": 19970 + }, + { + "epoch": 2.4057796508127636, + "grad_norm": 4.859030723571777, + "learning_rate": 4.1069349018400503e-07, + "loss": 0.298, + "step": 19980 + }, + { + "epoch": 2.406983744732089, + "grad_norm": 5.44400691986084, + "learning_rate": 4.101766030031562e-07, + "loss": 0.303, + "step": 19990 + }, + { + "epoch": 2.408187838651415, + "grad_norm": 4.533078193664551, + "learning_rate": 4.0965981499792307e-07, + "loss": 0.3055, + "step": 20000 + }, + { + "epoch": 2.4093919325707405, + "grad_norm": 5.147141456604004, + "learning_rate": 4.0914312673890054e-07, + "loss": 0.3141, + "step": 20010 + }, + { + "epoch": 2.410596026490066, + "grad_norm": 4.530623912811279, + "learning_rate": 4.0862653879657373e-07, + "loss": 0.3205, + "step": 20020 + }, + { + "epoch": 2.411800120409392, + "grad_norm": 4.804474830627441, + "learning_rate": 4.08110051741316e-07, + "loss": 0.3113, + "step": 20030 + }, + { + "epoch": 2.413004214328718, + "grad_norm": 4.642183780670166, + "learning_rate": 4.0759366614339015e-07, + "loss": 0.3115, + "step": 20040 + }, + { + "epoch": 2.4142083082480434, + "grad_norm": 4.975921630859375, + "learning_rate": 4.0707738257294685e-07, + "loss": 0.3165, + "step": 20050 + }, + { + "epoch": 2.415412402167369, + "grad_norm": 4.621540546417236, + "learning_rate": 4.065612016000241e-07, + "loss": 0.2914, + "step": 20060 + }, + { + "epoch": 2.4166164960866947, + "grad_norm": 4.194451808929443, + "learning_rate": 4.060451237945462e-07, + "loss": 0.3035, + "step": 20070 + }, + { + "epoch": 2.4178205900060203, + "grad_norm": 4.82729959487915, + "learning_rate": 4.05529149726324e-07, + "loss": 0.3068, + "step": 20080 + }, + { + "epoch": 2.419024683925346, + "grad_norm": 5.17459774017334, + "learning_rate": 4.050132799650538e-07, + "loss": 0.3092, + "step": 20090 + }, + { + "epoch": 2.420228777844672, + "grad_norm": 5.787187576293945, + "learning_rate": 4.0449751508031666e-07, + "loss": 0.3168, + "step": 20100 + }, + { + "epoch": 2.4214328717639977, + "grad_norm": 4.466209411621094, + "learning_rate": 4.039818556415775e-07, + "loss": 0.296, + "step": 20110 + }, + { + "epoch": 2.4226369656833233, + "grad_norm": 4.929852485656738, + "learning_rate": 4.034663022181852e-07, + "loss": 0.3135, + "step": 20120 + }, + { + "epoch": 2.423841059602649, + "grad_norm": 4.523739337921143, + "learning_rate": 4.029508553793718e-07, + "loss": 0.288, + "step": 20130 + }, + { + "epoch": 2.4250451535219746, + "grad_norm": 7.000367641448975, + "learning_rate": 4.0243551569425095e-07, + "loss": 0.3105, + "step": 20140 + }, + { + "epoch": 2.4262492474413007, + "grad_norm": 6.229575157165527, + "learning_rate": 4.019202837318185e-07, + "loss": 0.3166, + "step": 20150 + }, + { + "epoch": 2.4274533413606263, + "grad_norm": 5.243337154388428, + "learning_rate": 4.0140516006095134e-07, + "loss": 0.3046, + "step": 20160 + }, + { + "epoch": 2.428657435279952, + "grad_norm": 4.598159313201904, + "learning_rate": 4.0089014525040685e-07, + "loss": 0.3064, + "step": 20170 + }, + { + "epoch": 2.4298615291992776, + "grad_norm": 4.482394695281982, + "learning_rate": 4.003752398688218e-07, + "loss": 0.3097, + "step": 20180 + }, + { + "epoch": 2.431065623118603, + "grad_norm": 5.39198637008667, + "learning_rate": 3.9986044448471244e-07, + "loss": 0.3112, + "step": 20190 + }, 
+ { + "epoch": 2.432269717037929, + "grad_norm": 4.356963634490967, + "learning_rate": 3.9934575966647375e-07, + "loss": 0.3006, + "step": 20200 + }, + { + "epoch": 2.4334738109572545, + "grad_norm": 4.211975574493408, + "learning_rate": 3.9883118598237837e-07, + "loss": 0.2989, + "step": 20210 + }, + { + "epoch": 2.4346779048765805, + "grad_norm": 5.301422119140625, + "learning_rate": 3.9831672400057605e-07, + "loss": 0.3178, + "step": 20220 + }, + { + "epoch": 2.435881998795906, + "grad_norm": 4.181766510009766, + "learning_rate": 3.978023742890937e-07, + "loss": 0.3066, + "step": 20230 + }, + { + "epoch": 2.437086092715232, + "grad_norm": 5.18208122253418, + "learning_rate": 3.9728813741583383e-07, + "loss": 0.3001, + "step": 20240 + }, + { + "epoch": 2.4382901866345574, + "grad_norm": 5.382752418518066, + "learning_rate": 3.967740139485748e-07, + "loss": 0.3088, + "step": 20250 + }, + { + "epoch": 2.439494280553883, + "grad_norm": 5.215182304382324, + "learning_rate": 3.9626000445496934e-07, + "loss": 0.2882, + "step": 20260 + }, + { + "epoch": 2.440698374473209, + "grad_norm": 5.133399963378906, + "learning_rate": 3.957461095025444e-07, + "loss": 0.3303, + "step": 20270 + }, + { + "epoch": 2.4419024683925348, + "grad_norm": 5.194669246673584, + "learning_rate": 3.952323296587007e-07, + "loss": 0.3172, + "step": 20280 + }, + { + "epoch": 2.4431065623118604, + "grad_norm": 4.95144510269165, + "learning_rate": 3.947186654907119e-07, + "loss": 0.3138, + "step": 20290 + }, + { + "epoch": 2.444310656231186, + "grad_norm": 5.0588812828063965, + "learning_rate": 3.9420511756572346e-07, + "loss": 0.3058, + "step": 20300 + }, + { + "epoch": 2.4455147501505117, + "grad_norm": 5.033606052398682, + "learning_rate": 3.936916864507529e-07, + "loss": 0.3161, + "step": 20310 + }, + { + "epoch": 2.4467188440698373, + "grad_norm": 5.006187915802002, + "learning_rate": 3.9317837271268876e-07, + "loss": 0.2993, + "step": 20320 + }, + { + "epoch": 2.447922937989163, + "grad_norm": 4.955638408660889, + "learning_rate": 3.926651769182901e-07, + "loss": 0.3023, + "step": 20330 + }, + { + "epoch": 2.449127031908489, + "grad_norm": 4.786928653717041, + "learning_rate": 3.9215209963418513e-07, + "loss": 0.3207, + "step": 20340 + }, + { + "epoch": 2.4503311258278146, + "grad_norm": 4.456767559051514, + "learning_rate": 3.9163914142687177e-07, + "loss": 0.3142, + "step": 20350 + }, + { + "epoch": 2.4515352197471403, + "grad_norm": 5.671106338500977, + "learning_rate": 3.911263028627164e-07, + "loss": 0.3125, + "step": 20360 + }, + { + "epoch": 2.452739313666466, + "grad_norm": 5.525556564331055, + "learning_rate": 3.9061358450795344e-07, + "loss": 0.2972, + "step": 20370 + }, + { + "epoch": 2.4539434075857915, + "grad_norm": 4.18988561630249, + "learning_rate": 3.9010098692868397e-07, + "loss": 0.2971, + "step": 20380 + }, + { + "epoch": 2.4551475015051176, + "grad_norm": 5.705048561096191, + "learning_rate": 3.895885106908763e-07, + "loss": 0.3094, + "step": 20390 + }, + { + "epoch": 2.4563515954244433, + "grad_norm": 5.453742980957031, + "learning_rate": 3.890761563603647e-07, + "loss": 0.3079, + "step": 20400 + }, + { + "epoch": 2.457555689343769, + "grad_norm": 4.007357120513916, + "learning_rate": 3.885639245028488e-07, + "loss": 0.3119, + "step": 20410 + }, + { + "epoch": 2.4587597832630945, + "grad_norm": 5.247729301452637, + "learning_rate": 3.8805181568389255e-07, + "loss": 0.3047, + "step": 20420 + }, + { + "epoch": 2.45996387718242, + "grad_norm": 4.143746852874756, + "learning_rate": 
3.8753983046892465e-07, + "loss": 0.3062, + "step": 20430 + }, + { + "epoch": 2.461167971101746, + "grad_norm": 4.356471538543701, + "learning_rate": 3.8702796942323736e-07, + "loss": 0.3095, + "step": 20440 + }, + { + "epoch": 2.4623720650210714, + "grad_norm": 4.553625106811523, + "learning_rate": 3.8651623311198516e-07, + "loss": 0.3117, + "step": 20450 + }, + { + "epoch": 2.4635761589403975, + "grad_norm": 4.882122039794922, + "learning_rate": 3.860046221001855e-07, + "loss": 0.322, + "step": 20460 + }, + { + "epoch": 2.464780252859723, + "grad_norm": 5.218991756439209, + "learning_rate": 3.854931369527172e-07, + "loss": 0.3138, + "step": 20470 + }, + { + "epoch": 2.4659843467790488, + "grad_norm": 5.427024841308594, + "learning_rate": 3.849817782343201e-07, + "loss": 0.3125, + "step": 20480 + }, + { + "epoch": 2.4671884406983744, + "grad_norm": 4.729675769805908, + "learning_rate": 3.8447054650959447e-07, + "loss": 0.2925, + "step": 20490 + }, + { + "epoch": 2.4683925346177, + "grad_norm": 5.330557346343994, + "learning_rate": 3.8395944234300053e-07, + "loss": 0.2968, + "step": 20500 + }, + { + "epoch": 2.469596628537026, + "grad_norm": 4.960201740264893, + "learning_rate": 3.834484662988573e-07, + "loss": 0.3147, + "step": 20510 + }, + { + "epoch": 2.4708007224563517, + "grad_norm": 4.888551235198975, + "learning_rate": 3.829376189413427e-07, + "loss": 0.3098, + "step": 20520 + }, + { + "epoch": 2.4720048163756774, + "grad_norm": 4.717561721801758, + "learning_rate": 3.824269008344924e-07, + "loss": 0.3018, + "step": 20530 + }, + { + "epoch": 2.473208910295003, + "grad_norm": 4.666635990142822, + "learning_rate": 3.8191631254219927e-07, + "loss": 0.2942, + "step": 20540 + }, + { + "epoch": 2.4744130042143286, + "grad_norm": 5.138599872589111, + "learning_rate": 3.8140585462821296e-07, + "loss": 0.2922, + "step": 20550 + }, + { + "epoch": 2.4756170981336543, + "grad_norm": 5.150256633758545, + "learning_rate": 3.808955276561395e-07, + "loss": 0.3039, + "step": 20560 + }, + { + "epoch": 2.47682119205298, + "grad_norm": 5.677982807159424, + "learning_rate": 3.8038533218943954e-07, + "loss": 0.2928, + "step": 20570 + }, + { + "epoch": 2.478025285972306, + "grad_norm": 4.552664756774902, + "learning_rate": 3.798752687914292e-07, + "loss": 0.3108, + "step": 20580 + }, + { + "epoch": 2.4792293798916316, + "grad_norm": 4.48048210144043, + "learning_rate": 3.7936533802527855e-07, + "loss": 0.3159, + "step": 20590 + }, + { + "epoch": 2.4804334738109572, + "grad_norm": 4.3352370262146, + "learning_rate": 3.7885554045401147e-07, + "loss": 0.3079, + "step": 20600 + }, + { + "epoch": 2.481637567730283, + "grad_norm": 4.1587653160095215, + "learning_rate": 3.783458766405042e-07, + "loss": 0.3036, + "step": 20610 + }, + { + "epoch": 2.4828416616496085, + "grad_norm": 4.668213844299316, + "learning_rate": 3.7783634714748584e-07, + "loss": 0.3003, + "step": 20620 + }, + { + "epoch": 2.4840457555689346, + "grad_norm": 4.186696529388428, + "learning_rate": 3.7732695253753697e-07, + "loss": 0.3192, + "step": 20630 + }, + { + "epoch": 2.48524984948826, + "grad_norm": 4.841115951538086, + "learning_rate": 3.7681769337308954e-07, + "loss": 0.3064, + "step": 20640 + }, + { + "epoch": 2.486453943407586, + "grad_norm": 4.4625020027160645, + "learning_rate": 3.7630857021642514e-07, + "loss": 0.3059, + "step": 20650 + }, + { + "epoch": 2.4876580373269115, + "grad_norm": 4.459711074829102, + "learning_rate": 3.757995836296761e-07, + "loss": 0.2925, + "step": 20660 + }, + { + "epoch": 2.488862131246237, + 
"grad_norm": 4.983307361602783, + "learning_rate": 3.7529073417482345e-07, + "loss": 0.2961, + "step": 20670 + }, + { + "epoch": 2.4900662251655628, + "grad_norm": 4.813161373138428, + "learning_rate": 3.747820224136973e-07, + "loss": 0.3138, + "step": 20680 + }, + { + "epoch": 2.4912703190848884, + "grad_norm": 4.922794342041016, + "learning_rate": 3.742734489079748e-07, + "loss": 0.3219, + "step": 20690 + }, + { + "epoch": 2.4924744130042145, + "grad_norm": 5.428676128387451, + "learning_rate": 3.737650142191814e-07, + "loss": 0.3077, + "step": 20700 + }, + { + "epoch": 2.49367850692354, + "grad_norm": 4.670940399169922, + "learning_rate": 3.7325671890868895e-07, + "loss": 0.3035, + "step": 20710 + }, + { + "epoch": 2.4948826008428657, + "grad_norm": 4.245230674743652, + "learning_rate": 3.727485635377153e-07, + "loss": 0.3102, + "step": 20720 + }, + { + "epoch": 2.4960866947621914, + "grad_norm": 4.281071186065674, + "learning_rate": 3.7224054866732366e-07, + "loss": 0.2848, + "step": 20730 + }, + { + "epoch": 2.497290788681517, + "grad_norm": 4.969486236572266, + "learning_rate": 3.717326748584227e-07, + "loss": 0.3109, + "step": 20740 + }, + { + "epoch": 2.498494882600843, + "grad_norm": 6.3518500328063965, + "learning_rate": 3.712249426717647e-07, + "loss": 0.321, + "step": 20750 + }, + { + "epoch": 2.4996989765201687, + "grad_norm": 4.896385192871094, + "learning_rate": 3.707173526679458e-07, + "loss": 0.3096, + "step": 20760 + }, + { + "epoch": 2.5009030704394943, + "grad_norm": 4.546391487121582, + "learning_rate": 3.702099054074054e-07, + "loss": 0.3153, + "step": 20770 + }, + { + "epoch": 2.50210716435882, + "grad_norm": 4.817781925201416, + "learning_rate": 3.6970260145042475e-07, + "loss": 0.3072, + "step": 20780 + }, + { + "epoch": 2.5033112582781456, + "grad_norm": 4.495319366455078, + "learning_rate": 3.691954413571276e-07, + "loss": 0.316, + "step": 20790 + }, + { + "epoch": 2.5045153521974717, + "grad_norm": 4.200586318969727, + "learning_rate": 3.6868842568747826e-07, + "loss": 0.3146, + "step": 20800 + }, + { + "epoch": 2.505719446116797, + "grad_norm": 5.999356269836426, + "learning_rate": 3.681815550012816e-07, + "loss": 0.3087, + "step": 20810 + }, + { + "epoch": 2.506923540036123, + "grad_norm": 4.140690326690674, + "learning_rate": 3.676748298581828e-07, + "loss": 0.2786, + "step": 20820 + }, + { + "epoch": 2.5081276339554486, + "grad_norm": 4.519384384155273, + "learning_rate": 3.6716825081766634e-07, + "loss": 0.3073, + "step": 20830 + }, + { + "epoch": 2.509331727874774, + "grad_norm": 4.580509185791016, + "learning_rate": 3.6666181843905477e-07, + "loss": 0.3224, + "step": 20840 + }, + { + "epoch": 2.5105358217941, + "grad_norm": 4.371671676635742, + "learning_rate": 3.661555332815092e-07, + "loss": 0.303, + "step": 20850 + }, + { + "epoch": 2.5117399157134255, + "grad_norm": 5.235719680786133, + "learning_rate": 3.656493959040283e-07, + "loss": 0.3104, + "step": 20860 + }, + { + "epoch": 2.5129440096327516, + "grad_norm": 5.564718246459961, + "learning_rate": 3.651434068654474e-07, + "loss": 0.3111, + "step": 20870 + }, + { + "epoch": 2.514148103552077, + "grad_norm": 4.76020622253418, + "learning_rate": 3.646375667244378e-07, + "loss": 0.3153, + "step": 20880 + }, + { + "epoch": 2.515352197471403, + "grad_norm": 4.534407138824463, + "learning_rate": 3.6413187603950667e-07, + "loss": 0.305, + "step": 20890 + }, + { + "epoch": 2.5165562913907285, + "grad_norm": 5.413814067840576, + "learning_rate": 3.636263353689962e-07, + "loss": 0.3088, + "step": 20900 + }, + 
{ + "epoch": 2.517760385310054, + "grad_norm": 5.003753185272217, + "learning_rate": 3.6312094527108307e-07, + "loss": 0.3146, + "step": 20910 + }, + { + "epoch": 2.51896447922938, + "grad_norm": 5.368070125579834, + "learning_rate": 3.6261570630377713e-07, + "loss": 0.3131, + "step": 20920 + }, + { + "epoch": 2.5201685731487053, + "grad_norm": 5.054159641265869, + "learning_rate": 3.621106190249219e-07, + "loss": 0.2967, + "step": 20930 + }, + { + "epoch": 2.5213726670680314, + "grad_norm": 5.523135185241699, + "learning_rate": 3.616056839921932e-07, + "loss": 0.3154, + "step": 20940 + }, + { + "epoch": 2.522576760987357, + "grad_norm": 5.352376937866211, + "learning_rate": 3.6110090176309914e-07, + "loss": 0.3033, + "step": 20950 + }, + { + "epoch": 2.5237808549066827, + "grad_norm": 3.677163600921631, + "learning_rate": 3.605962728949783e-07, + "loss": 0.3198, + "step": 20960 + }, + { + "epoch": 2.5249849488260083, + "grad_norm": 4.4316840171813965, + "learning_rate": 3.6009179794500067e-07, + "loss": 0.304, + "step": 20970 + }, + { + "epoch": 2.526189042745334, + "grad_norm": 4.927300453186035, + "learning_rate": 3.5958747747016603e-07, + "loss": 0.3221, + "step": 20980 + }, + { + "epoch": 2.52739313666466, + "grad_norm": 5.448822975158691, + "learning_rate": 3.590833120273038e-07, + "loss": 0.3186, + "step": 20990 + }, + { + "epoch": 2.5285972305839857, + "grad_norm": 4.188570022583008, + "learning_rate": 3.5857930217307163e-07, + "loss": 0.3015, + "step": 21000 + }, + { + "epoch": 2.5298013245033113, + "grad_norm": 4.157015323638916, + "learning_rate": 3.580754484639561e-07, + "loss": 0.2909, + "step": 21010 + }, + { + "epoch": 2.531005418422637, + "grad_norm": 4.773519992828369, + "learning_rate": 3.5757175145627107e-07, + "loss": 0.3034, + "step": 21020 + }, + { + "epoch": 2.5322095123419626, + "grad_norm": 5.435080051422119, + "learning_rate": 3.570682117061573e-07, + "loss": 0.3148, + "step": 21030 + }, + { + "epoch": 2.5334136062612886, + "grad_norm": 4.959787368774414, + "learning_rate": 3.56564829769582e-07, + "loss": 0.3115, + "step": 21040 + }, + { + "epoch": 2.534617700180614, + "grad_norm": 4.7358880043029785, + "learning_rate": 3.5606160620233815e-07, + "loss": 0.3078, + "step": 21050 + }, + { + "epoch": 2.53582179409994, + "grad_norm": 4.220034599304199, + "learning_rate": 3.5555854156004404e-07, + "loss": 0.298, + "step": 21060 + }, + { + "epoch": 2.5370258880192655, + "grad_norm": 4.433871746063232, + "learning_rate": 3.550556363981422e-07, + "loss": 0.2809, + "step": 21070 + }, + { + "epoch": 2.538229981938591, + "grad_norm": 4.491239070892334, + "learning_rate": 3.5455289127189907e-07, + "loss": 0.3179, + "step": 21080 + }, + { + "epoch": 2.539434075857917, + "grad_norm": 4.969503879547119, + "learning_rate": 3.540503067364047e-07, + "loss": 0.3018, + "step": 21090 + }, + { + "epoch": 2.5406381697772424, + "grad_norm": 4.266849040985107, + "learning_rate": 3.535478833465717e-07, + "loss": 0.3121, + "step": 21100 + }, + { + "epoch": 2.5418422636965685, + "grad_norm": 4.8507771492004395, + "learning_rate": 3.5304562165713435e-07, + "loss": 0.317, + "step": 21110 + }, + { + "epoch": 2.543046357615894, + "grad_norm": 4.610383987426758, + "learning_rate": 3.525435222226491e-07, + "loss": 0.3083, + "step": 21120 + }, + { + "epoch": 2.54425045153522, + "grad_norm": 4.408012390136719, + "learning_rate": 3.5204158559749275e-07, + "loss": 0.3141, + "step": 21130 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 5.178010940551758, + "learning_rate": 3.5153981233586274e-07, 
+ "loss": 0.3106, + "step": 21140 + }, + { + "epoch": 2.546658639373871, + "grad_norm": 4.6306681632995605, + "learning_rate": 3.5103820299177535e-07, + "loss": 0.3086, + "step": 21150 + }, + { + "epoch": 2.547862733293197, + "grad_norm": 5.366611003875732, + "learning_rate": 3.505367581190668e-07, + "loss": 0.2985, + "step": 21160 + }, + { + "epoch": 2.5490668272125223, + "grad_norm": 5.572306156158447, + "learning_rate": 3.5003547827139125e-07, + "loss": 0.2976, + "step": 21170 + }, + { + "epoch": 2.5502709211318484, + "grad_norm": 5.326085090637207, + "learning_rate": 3.495343640022209e-07, + "loss": 0.2971, + "step": 21180 + }, + { + "epoch": 2.551475015051174, + "grad_norm": 7.600101947784424, + "learning_rate": 3.4903341586484456e-07, + "loss": 0.2961, + "step": 21190 + }, + { + "epoch": 2.5526791089704997, + "grad_norm": 4.568670272827148, + "learning_rate": 3.4853263441236834e-07, + "loss": 0.3142, + "step": 21200 + }, + { + "epoch": 2.5538832028898253, + "grad_norm": 4.9445695877075195, + "learning_rate": 3.480320201977138e-07, + "loss": 0.2988, + "step": 21210 + }, + { + "epoch": 2.555087296809151, + "grad_norm": 5.26786994934082, + "learning_rate": 3.475315737736183e-07, + "loss": 0.3074, + "step": 21220 + }, + { + "epoch": 2.556291390728477, + "grad_norm": 4.316328525543213, + "learning_rate": 3.4703129569263323e-07, + "loss": 0.2917, + "step": 21230 + }, + { + "epoch": 2.5574954846478026, + "grad_norm": 4.018758773803711, + "learning_rate": 3.465311865071248e-07, + "loss": 0.2967, + "step": 21240 + }, + { + "epoch": 2.5586995785671283, + "grad_norm": 5.121528625488281, + "learning_rate": 3.460312467692725e-07, + "loss": 0.3061, + "step": 21250 + }, + { + "epoch": 2.559903672486454, + "grad_norm": 4.710129261016846, + "learning_rate": 3.4553147703106886e-07, + "loss": 0.3074, + "step": 21260 + }, + { + "epoch": 2.5611077664057795, + "grad_norm": 4.447737216949463, + "learning_rate": 3.4503187784431825e-07, + "loss": 0.3062, + "step": 21270 + }, + { + "epoch": 2.5623118603251056, + "grad_norm": 4.8179612159729, + "learning_rate": 3.445324497606372e-07, + "loss": 0.3007, + "step": 21280 + }, + { + "epoch": 2.563515954244431, + "grad_norm": 4.53162956237793, + "learning_rate": 3.440331933314532e-07, + "loss": 0.3103, + "step": 21290 + }, + { + "epoch": 2.564720048163757, + "grad_norm": 4.889903545379639, + "learning_rate": 3.435341091080042e-07, + "loss": 0.3109, + "step": 21300 + }, + { + "epoch": 2.5659241420830825, + "grad_norm": 4.858291149139404, + "learning_rate": 3.430351976413378e-07, + "loss": 0.3191, + "step": 21310 + }, + { + "epoch": 2.567128236002408, + "grad_norm": 4.58107852935791, + "learning_rate": 3.425364594823114e-07, + "loss": 0.2853, + "step": 21320 + }, + { + "epoch": 2.5683323299217338, + "grad_norm": 5.6206207275390625, + "learning_rate": 3.420378951815903e-07, + "loss": 0.3081, + "step": 21330 + }, + { + "epoch": 2.5695364238410594, + "grad_norm": 5.069255352020264, + "learning_rate": 3.4153950528964866e-07, + "loss": 0.3034, + "step": 21340 + }, + { + "epoch": 2.5707405177603855, + "grad_norm": 5.086771488189697, + "learning_rate": 3.4104129035676743e-07, + "loss": 0.318, + "step": 21350 + }, + { + "epoch": 2.571944611679711, + "grad_norm": 5.416161060333252, + "learning_rate": 3.4054325093303447e-07, + "loss": 0.3062, + "step": 21360 + }, + { + "epoch": 2.5731487055990367, + "grad_norm": 4.536307334899902, + "learning_rate": 3.4004538756834415e-07, + "loss": 0.3028, + "step": 21370 + }, + { + "epoch": 2.5743527995183624, + "grad_norm": 
4.512822151184082, + "learning_rate": 3.3954770081239657e-07, + "loss": 0.3046, + "step": 21380 + }, + { + "epoch": 2.575556893437688, + "grad_norm": 5.5262322425842285, + "learning_rate": 3.39050191214696e-07, + "loss": 0.3012, + "step": 21390 + }, + { + "epoch": 2.576760987357014, + "grad_norm": 5.3342509269714355, + "learning_rate": 3.38552859324552e-07, + "loss": 0.3046, + "step": 21400 + }, + { + "epoch": 2.5779650812763397, + "grad_norm": 4.271503925323486, + "learning_rate": 3.380557056910778e-07, + "loss": 0.3097, + "step": 21410 + }, + { + "epoch": 2.5791691751956654, + "grad_norm": 4.600352764129639, + "learning_rate": 3.375587308631891e-07, + "loss": 0.3094, + "step": 21420 + }, + { + "epoch": 2.580373269114991, + "grad_norm": 4.630692958831787, + "learning_rate": 3.3706193538960493e-07, + "loss": 0.3117, + "step": 21430 + }, + { + "epoch": 2.5815773630343166, + "grad_norm": 4.425769329071045, + "learning_rate": 3.3656531981884604e-07, + "loss": 0.3097, + "step": 21440 + }, + { + "epoch": 2.5827814569536423, + "grad_norm": 4.963135242462158, + "learning_rate": 3.3606888469923474e-07, + "loss": 0.3079, + "step": 21450 + }, + { + "epoch": 2.583985550872968, + "grad_norm": 5.204167366027832, + "learning_rate": 3.3557263057889344e-07, + "loss": 0.2965, + "step": 21460 + }, + { + "epoch": 2.585189644792294, + "grad_norm": 4.431160926818848, + "learning_rate": 3.3507655800574554e-07, + "loss": 0.2973, + "step": 21470 + }, + { + "epoch": 2.5863937387116196, + "grad_norm": 5.386955261230469, + "learning_rate": 3.345806675275134e-07, + "loss": 0.3035, + "step": 21480 + }, + { + "epoch": 2.5875978326309452, + "grad_norm": 4.363948345184326, + "learning_rate": 3.340849596917189e-07, + "loss": 0.2848, + "step": 21490 + }, + { + "epoch": 2.588801926550271, + "grad_norm": 4.813036918640137, + "learning_rate": 3.3358943504568147e-07, + "loss": 0.3086, + "step": 21500 + }, + { + "epoch": 2.5900060204695965, + "grad_norm": 4.847212791442871, + "learning_rate": 3.3309409413651895e-07, + "loss": 0.2939, + "step": 21510 + }, + { + "epoch": 2.5912101143889226, + "grad_norm": 6.291325569152832, + "learning_rate": 3.3259893751114606e-07, + "loss": 0.3117, + "step": 21520 + }, + { + "epoch": 2.592414208308248, + "grad_norm": 5.317537307739258, + "learning_rate": 3.321039657162742e-07, + "loss": 0.3222, + "step": 21530 + }, + { + "epoch": 2.593618302227574, + "grad_norm": 4.0502190589904785, + "learning_rate": 3.3160917929841027e-07, + "loss": 0.2994, + "step": 21540 + }, + { + "epoch": 2.5948223961468995, + "grad_norm": 5.079105377197266, + "learning_rate": 3.3111457880385686e-07, + "loss": 0.3002, + "step": 21550 + }, + { + "epoch": 2.596026490066225, + "grad_norm": 5.073225975036621, + "learning_rate": 3.3062016477871147e-07, + "loss": 0.2969, + "step": 21560 + }, + { + "epoch": 2.5972305839855507, + "grad_norm": 5.702369689941406, + "learning_rate": 3.3012593776886524e-07, + "loss": 0.3229, + "step": 21570 + }, + { + "epoch": 2.5984346779048764, + "grad_norm": 5.685046672821045, + "learning_rate": 3.296318983200028e-07, + "loss": 0.3149, + "step": 21580 + }, + { + "epoch": 2.5996387718242024, + "grad_norm": 5.351219654083252, + "learning_rate": 3.2913804697760244e-07, + "loss": 0.3116, + "step": 21590 + }, + { + "epoch": 2.600842865743528, + "grad_norm": 4.610897541046143, + "learning_rate": 3.286443842869338e-07, + "loss": 0.3092, + "step": 21600 + }, + { + "epoch": 2.6020469596628537, + "grad_norm": 4.982673168182373, + "learning_rate": 3.2815091079305895e-07, + "loss": 0.2942, + "step": 21610 + 
}, + { + "epoch": 2.6032510535821793, + "grad_norm": 5.005990982055664, + "learning_rate": 3.2765762704083067e-07, + "loss": 0.311, + "step": 21620 + }, + { + "epoch": 2.604455147501505, + "grad_norm": 4.512310028076172, + "learning_rate": 3.271645335748923e-07, + "loss": 0.3267, + "step": 21630 + }, + { + "epoch": 2.605659241420831, + "grad_norm": 4.117137432098389, + "learning_rate": 3.2667163093967716e-07, + "loss": 0.3003, + "step": 21640 + }, + { + "epoch": 2.6068633353401567, + "grad_norm": 5.019242763519287, + "learning_rate": 3.2617891967940806e-07, + "loss": 0.2979, + "step": 21650 + }, + { + "epoch": 2.6080674292594823, + "grad_norm": 4.304302215576172, + "learning_rate": 3.2568640033809597e-07, + "loss": 0.3009, + "step": 21660 + }, + { + "epoch": 2.609271523178808, + "grad_norm": 5.543119430541992, + "learning_rate": 3.2519407345954043e-07, + "loss": 0.3085, + "step": 21670 + }, + { + "epoch": 2.6104756170981336, + "grad_norm": 4.892364025115967, + "learning_rate": 3.247019395873283e-07, + "loss": 0.2965, + "step": 21680 + }, + { + "epoch": 2.611679711017459, + "grad_norm": 3.9560534954071045, + "learning_rate": 3.242099992648336e-07, + "loss": 0.2994, + "step": 21690 + }, + { + "epoch": 2.612883804936785, + "grad_norm": 4.653574466705322, + "learning_rate": 3.2371825303521604e-07, + "loss": 0.3072, + "step": 21700 + }, + { + "epoch": 2.614087898856111, + "grad_norm": 4.340296268463135, + "learning_rate": 3.232267014414216e-07, + "loss": 0.2965, + "step": 21710 + }, + { + "epoch": 2.6152919927754366, + "grad_norm": 3.889099597930908, + "learning_rate": 3.2273534502618136e-07, + "loss": 0.3212, + "step": 21720 + }, + { + "epoch": 2.616496086694762, + "grad_norm": 4.952009201049805, + "learning_rate": 3.2224418433201033e-07, + "loss": 0.3121, + "step": 21730 + }, + { + "epoch": 2.617700180614088, + "grad_norm": 5.229816913604736, + "learning_rate": 3.2175321990120797e-07, + "loss": 0.304, + "step": 21740 + }, + { + "epoch": 2.6189042745334135, + "grad_norm": 4.951354503631592, + "learning_rate": 3.2126245227585693e-07, + "loss": 0.3024, + "step": 21750 + }, + { + "epoch": 2.6201083684527395, + "grad_norm": 5.034163475036621, + "learning_rate": 3.2077188199782257e-07, + "loss": 0.3057, + "step": 21760 + }, + { + "epoch": 2.621312462372065, + "grad_norm": 5.984414100646973, + "learning_rate": 3.20281509608752e-07, + "loss": 0.3209, + "step": 21770 + }, + { + "epoch": 2.622516556291391, + "grad_norm": 4.373472213745117, + "learning_rate": 3.1979133565007434e-07, + "loss": 0.2947, + "step": 21780 + }, + { + "epoch": 2.6237206502107164, + "grad_norm": 4.750053405761719, + "learning_rate": 3.193013606629994e-07, + "loss": 0.3196, + "step": 21790 + }, + { + "epoch": 2.624924744130042, + "grad_norm": 4.528110027313232, + "learning_rate": 3.188115851885174e-07, + "loss": 0.3053, + "step": 21800 + }, + { + "epoch": 2.6261288380493677, + "grad_norm": 4.8642072677612305, + "learning_rate": 3.1832200976739786e-07, + "loss": 0.3328, + "step": 21810 + }, + { + "epoch": 2.6273329319686933, + "grad_norm": 4.624762535095215, + "learning_rate": 3.1783263494019e-07, + "loss": 0.3123, + "step": 21820 + }, + { + "epoch": 2.6285370258880194, + "grad_norm": 4.700741767883301, + "learning_rate": 3.1734346124722135e-07, + "loss": 0.3011, + "step": 21830 + }, + { + "epoch": 2.629741119807345, + "grad_norm": 5.0118021965026855, + "learning_rate": 3.1685448922859716e-07, + "loss": 0.3163, + "step": 21840 + }, + { + "epoch": 2.6309452137266707, + "grad_norm": 5.321165084838867, + "learning_rate": 
3.1636571942420014e-07, + "loss": 0.3019, + "step": 21850 + }, + { + "epoch": 2.6321493076459963, + "grad_norm": 5.864070892333984, + "learning_rate": 3.1587715237368996e-07, + "loss": 0.3027, + "step": 21860 + }, + { + "epoch": 2.633353401565322, + "grad_norm": 4.458745956420898, + "learning_rate": 3.1538878861650194e-07, + "loss": 0.3152, + "step": 21870 + }, + { + "epoch": 2.634557495484648, + "grad_norm": 4.945919036865234, + "learning_rate": 3.149006286918474e-07, + "loss": 0.3238, + "step": 21880 + }, + { + "epoch": 2.6357615894039736, + "grad_norm": 4.671433448791504, + "learning_rate": 3.144126731387126e-07, + "loss": 0.2941, + "step": 21890 + }, + { + "epoch": 2.6369656833232993, + "grad_norm": 5.389127731323242, + "learning_rate": 3.1392492249585744e-07, + "loss": 0.3223, + "step": 21900 + }, + { + "epoch": 2.638169777242625, + "grad_norm": 5.42547607421875, + "learning_rate": 3.134373773018165e-07, + "loss": 0.305, + "step": 21910 + }, + { + "epoch": 2.6393738711619505, + "grad_norm": 5.633350849151611, + "learning_rate": 3.129500380948973e-07, + "loss": 0.296, + "step": 21920 + }, + { + "epoch": 2.640577965081276, + "grad_norm": 4.668237209320068, + "learning_rate": 3.1246290541317937e-07, + "loss": 0.3032, + "step": 21930 + }, + { + "epoch": 2.641782059000602, + "grad_norm": 4.56117057800293, + "learning_rate": 3.119759797945147e-07, + "loss": 0.3036, + "step": 21940 + }, + { + "epoch": 2.642986152919928, + "grad_norm": 5.208002090454102, + "learning_rate": 3.114892617765266e-07, + "loss": 0.2983, + "step": 21950 + }, + { + "epoch": 2.6441902468392535, + "grad_norm": 4.775214195251465, + "learning_rate": 3.110027518966094e-07, + "loss": 0.3104, + "step": 21960 + }, + { + "epoch": 2.645394340758579, + "grad_norm": 4.55642032623291, + "learning_rate": 3.1051645069192675e-07, + "loss": 0.3162, + "step": 21970 + }, + { + "epoch": 2.646598434677905, + "grad_norm": 4.810263156890869, + "learning_rate": 3.1003035869941295e-07, + "loss": 0.2958, + "step": 21980 + }, + { + "epoch": 2.6478025285972304, + "grad_norm": 4.988792896270752, + "learning_rate": 3.0954447645577063e-07, + "loss": 0.308, + "step": 21990 + }, + { + "epoch": 2.6490066225165565, + "grad_norm": 4.394057273864746, + "learning_rate": 3.0905880449747134e-07, + "loss": 0.2995, + "step": 22000 + } + ], + "logging_steps": 10, + "max_steps": 33220, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.886664442836628e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
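
Note (illustrative sketch, not part of the checkpoint): the dump above is the standard Trainer state, whose "log_history" entries each carry "epoch", "grad_norm", "learning_rate", "loss", and "step", emitted every `logging_steps` (10) optimizer steps. Assuming the file is read back from a checkpoint directory and that matplotlib is available (both are assumptions, not implied by the patch), the loss curve could be inspected roughly like this:

    # Hedged sketch: load the dumped trainer state and plot training loss vs. step.
    # The file path and the use of matplotlib are assumptions for illustration only.
    import json
    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Keep only entries that logged a training loss.
    entries = [e for e in state["log_history"] if "loss" in e]
    steps = [e["step"] for e in entries]
    losses = [e["loss"] for e in entries]

    plt.plot(steps, losses)
    plt.xlabel("step")
    plt.ylabel("training loss")
    plt.title(f"epoch {state['epoch']:.2f} of {state['num_train_epochs']}, "
              f"step {state['global_step']} / {state['max_steps']}")
    plt.show()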