{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24999823739679958,
"eval_steps": 500,
"global_step": 13297,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000376022016089042,
"grad_norm": 69.5,
"learning_rate": 1.9843342036553526e-08,
"loss": 2.5216,
"step": 20
},
{
"epoch": 0.000752044032178084,
"grad_norm": 75.0,
"learning_rate": 4.073107049608355e-08,
"loss": 2.4632,
"step": 40
},
{
"epoch": 0.001128066048267126,
"grad_norm": 109.0,
"learning_rate": 6.161879895561358e-08,
"loss": 2.5604,
"step": 60
},
{
"epoch": 0.001504088064356168,
"grad_norm": 48.5,
"learning_rate": 8.250652741514362e-08,
"loss": 2.4744,
"step": 80
},
{
"epoch": 0.0018801100804452101,
"grad_norm": 45.25,
"learning_rate": 1.0339425587467364e-07,
"loss": 2.5512,
"step": 100
},
{
"epoch": 0.002256132096534252,
"grad_norm": 38.25,
"learning_rate": 1.2428198433420367e-07,
"loss": 2.4959,
"step": 120
},
{
"epoch": 0.002632154112623294,
"grad_norm": 30.5,
"learning_rate": 1.451697127937337e-07,
"loss": 2.5159,
"step": 140
},
{
"epoch": 0.003008176128712336,
"grad_norm": 29.0,
"learning_rate": 1.660574412532637e-07,
"loss": 2.5399,
"step": 160
},
{
"epoch": 0.003384198144801378,
"grad_norm": 29.625,
"learning_rate": 1.8694516971279375e-07,
"loss": 2.4812,
"step": 180
},
{
"epoch": 0.0037602201608904202,
"grad_norm": 25.0,
"learning_rate": 2.0783289817232378e-07,
"loss": 2.4797,
"step": 200
},
{
"epoch": 0.004136242176979462,
"grad_norm": 24.625,
"learning_rate": 2.2872062663185383e-07,
"loss": 2.4898,
"step": 220
},
{
"epoch": 0.004512264193068504,
"grad_norm": 17.0,
"learning_rate": 2.4960835509138383e-07,
"loss": 2.4359,
"step": 240
},
{
"epoch": 0.004888286209157546,
"grad_norm": 39.75,
"learning_rate": 2.7049608355091385e-07,
"loss": 2.4451,
"step": 260
},
{
"epoch": 0.005264308225246588,
"grad_norm": 17.75,
"learning_rate": 2.913838120104439e-07,
"loss": 2.4747,
"step": 280
},
{
"epoch": 0.00564033024133563,
"grad_norm": 19.75,
"learning_rate": 3.122715404699739e-07,
"loss": 2.4672,
"step": 300
},
{
"epoch": 0.006016352257424672,
"grad_norm": 27.125,
"learning_rate": 3.3315926892950393e-07,
"loss": 2.44,
"step": 320
},
{
"epoch": 0.006392374273513714,
"grad_norm": 19.75,
"learning_rate": 3.5404699738903396e-07,
"loss": 2.494,
"step": 340
},
{
"epoch": 0.006768396289602756,
"grad_norm": 39.25,
"learning_rate": 3.7493472584856404e-07,
"loss": 2.4068,
"step": 360
},
{
"epoch": 0.007144418305691798,
"grad_norm": 27.75,
"learning_rate": 3.95822454308094e-07,
"loss": 2.3509,
"step": 380
},
{
"epoch": 0.0075204403217808405,
"grad_norm": 14.9375,
"learning_rate": 4.1671018276762403e-07,
"loss": 2.3596,
"step": 400
},
{
"epoch": 0.007896462337869883,
"grad_norm": 21.5,
"learning_rate": 4.375979112271541e-07,
"loss": 2.4322,
"step": 420
},
{
"epoch": 0.008272484353958925,
"grad_norm": 17.25,
"learning_rate": 4.584856396866841e-07,
"loss": 2.4769,
"step": 440
},
{
"epoch": 0.008648506370047966,
"grad_norm": 24.375,
"learning_rate": 4.793733681462142e-07,
"loss": 2.3957,
"step": 460
},
{
"epoch": 0.009024528386137008,
"grad_norm": 16.875,
"learning_rate": 5.002610966057442e-07,
"loss": 2.4445,
"step": 480
},
{
"epoch": 0.00940055040222605,
"grad_norm": 21.75,
"learning_rate": 5.211488250652742e-07,
"loss": 2.4009,
"step": 500
},
{
"epoch": 0.009776572418315092,
"grad_norm": 26.375,
"learning_rate": 5.420365535248042e-07,
"loss": 2.3618,
"step": 520
},
{
"epoch": 0.010152594434404135,
"grad_norm": 24.5,
"learning_rate": 5.629242819843343e-07,
"loss": 2.3718,
"step": 540
},
{
"epoch": 0.010528616450493177,
"grad_norm": 19.875,
"learning_rate": 5.838120104438643e-07,
"loss": 2.3708,
"step": 560
},
{
"epoch": 0.010904638466582219,
"grad_norm": 18.875,
"learning_rate": 6.046997389033943e-07,
"loss": 2.4253,
"step": 580
},
{
"epoch": 0.01128066048267126,
"grad_norm": 28.0,
"learning_rate": 6.255874673629243e-07,
"loss": 2.3592,
"step": 600
},
{
"epoch": 0.011656682498760302,
"grad_norm": 32.25,
"learning_rate": 6.464751958224544e-07,
"loss": 2.3199,
"step": 620
},
{
"epoch": 0.012032704514849344,
"grad_norm": 33.25,
"learning_rate": 6.673629242819844e-07,
"loss": 2.3505,
"step": 640
},
{
"epoch": 0.012408726530938387,
"grad_norm": 41.0,
"learning_rate": 6.882506527415145e-07,
"loss": 2.3872,
"step": 660
},
{
"epoch": 0.012784748547027429,
"grad_norm": 15.625,
"learning_rate": 7.091383812010443e-07,
"loss": 2.3008,
"step": 680
},
{
"epoch": 0.01316077056311647,
"grad_norm": 31.5,
"learning_rate": 7.300261096605745e-07,
"loss": 2.168,
"step": 700
},
{
"epoch": 0.013536792579205512,
"grad_norm": 71.0,
"learning_rate": 7.509138381201045e-07,
"loss": 2.2318,
"step": 720
},
{
"epoch": 0.013912814595294554,
"grad_norm": 32.25,
"learning_rate": 7.718015665796345e-07,
"loss": 2.2759,
"step": 740
},
{
"epoch": 0.014288836611383596,
"grad_norm": 71.0,
"learning_rate": 7.926892950391646e-07,
"loss": 2.2838,
"step": 760
},
{
"epoch": 0.01466485862747264,
"grad_norm": 37.0,
"learning_rate": 8.135770234986947e-07,
"loss": 2.2449,
"step": 780
},
{
"epoch": 0.015040880643561681,
"grad_norm": 100.0,
"learning_rate": 8.344647519582245e-07,
"loss": 2.2566,
"step": 800
},
{
"epoch": 0.015416902659650723,
"grad_norm": 52.5,
"learning_rate": 8.553524804177546e-07,
"loss": 2.2765,
"step": 820
},
{
"epoch": 0.015792924675739766,
"grad_norm": 30.0,
"learning_rate": 8.762402088772847e-07,
"loss": 2.2282,
"step": 840
},
{
"epoch": 0.016168946691828806,
"grad_norm": 22.75,
"learning_rate": 8.971279373368147e-07,
"loss": 2.2817,
"step": 860
},
{
"epoch": 0.01654496870791785,
"grad_norm": 70.5,
"learning_rate": 9.180156657963447e-07,
"loss": 2.209,
"step": 880
},
{
"epoch": 0.01692099072400689,
"grad_norm": 107.0,
"learning_rate": 9.389033942558748e-07,
"loss": 2.2978,
"step": 900
},
{
"epoch": 0.017297012740095933,
"grad_norm": 52.5,
"learning_rate": 9.597911227154048e-07,
"loss": 2.2589,
"step": 920
},
{
"epoch": 0.017673034756184973,
"grad_norm": 90.0,
"learning_rate": 9.806788511749348e-07,
"loss": 2.1987,
"step": 940
},
{
"epoch": 0.018049056772274016,
"grad_norm": 45.75,
"learning_rate": 1.0015665796344648e-06,
"loss": 2.1721,
"step": 960
},
{
"epoch": 0.01842507878836306,
"grad_norm": 191.0,
"learning_rate": 1.0224543080939948e-06,
"loss": 2.2062,
"step": 980
},
{
"epoch": 0.0188011008044521,
"grad_norm": 67.5,
"learning_rate": 1.0433420365535249e-06,
"loss": 2.1984,
"step": 1000
},
{
"epoch": 0.019177122820541143,
"grad_norm": 61.25,
"learning_rate": 1.0642297650130549e-06,
"loss": 2.2189,
"step": 1020
},
{
"epoch": 0.019553144836630183,
"grad_norm": 49.5,
"learning_rate": 1.085117493472585e-06,
"loss": 2.2025,
"step": 1040
},
{
"epoch": 0.019929166852719227,
"grad_norm": 80.5,
"learning_rate": 1.1060052219321151e-06,
"loss": 2.1776,
"step": 1060
},
{
"epoch": 0.02030518886880827,
"grad_norm": 139.0,
"learning_rate": 1.126892950391645e-06,
"loss": 2.1829,
"step": 1080
},
{
"epoch": 0.02068121088489731,
"grad_norm": 127.0,
"learning_rate": 1.147780678851175e-06,
"loss": 2.1865,
"step": 1100
},
{
"epoch": 0.021057232900986354,
"grad_norm": 163.0,
"learning_rate": 1.168668407310705e-06,
"loss": 2.1854,
"step": 1120
},
{
"epoch": 0.021433254917075394,
"grad_norm": 71.5,
"learning_rate": 1.189556135770235e-06,
"loss": 2.163,
"step": 1140
},
{
"epoch": 0.021809276933164437,
"grad_norm": 98.0,
"learning_rate": 1.210443864229765e-06,
"loss": 2.071,
"step": 1160
},
{
"epoch": 0.02218529894925348,
"grad_norm": 165.0,
"learning_rate": 1.2313315926892953e-06,
"loss": 2.145,
"step": 1180
},
{
"epoch": 0.02256132096534252,
"grad_norm": 35.5,
"learning_rate": 1.2522193211488251e-06,
"loss": 2.0652,
"step": 1200
},
{
"epoch": 0.022937342981431564,
"grad_norm": 115.5,
"learning_rate": 1.2731070496083554e-06,
"loss": 2.1296,
"step": 1220
},
{
"epoch": 0.023313364997520604,
"grad_norm": 90.0,
"learning_rate": 1.2939947780678852e-06,
"loss": 2.0437,
"step": 1240
},
{
"epoch": 0.023689387013609647,
"grad_norm": 45.0,
"learning_rate": 1.3148825065274152e-06,
"loss": 2.0833,
"step": 1260
},
{
"epoch": 0.024065409029698687,
"grad_norm": 32.75,
"learning_rate": 1.3357702349869452e-06,
"loss": 2.0352,
"step": 1280
},
{
"epoch": 0.02444143104578773,
"grad_norm": 153.0,
"learning_rate": 1.3566579634464752e-06,
"loss": 2.054,
"step": 1300
},
{
"epoch": 0.024817453061876774,
"grad_norm": 149.0,
"learning_rate": 1.3775456919060055e-06,
"loss": 2.0609,
"step": 1320
},
{
"epoch": 0.025193475077965814,
"grad_norm": 50.75,
"learning_rate": 1.3984334203655353e-06,
"loss": 1.97,
"step": 1340
},
{
"epoch": 0.025569497094054858,
"grad_norm": 121.5,
"learning_rate": 1.4193211488250655e-06,
"loss": 1.9576,
"step": 1360
},
{
"epoch": 0.025945519110143898,
"grad_norm": 31.0,
"learning_rate": 1.4402088772845953e-06,
"loss": 2.0156,
"step": 1380
},
{
"epoch": 0.02632154112623294,
"grad_norm": 133.0,
"learning_rate": 1.4610966057441254e-06,
"loss": 2.0576,
"step": 1400
},
{
"epoch": 0.026697563142321985,
"grad_norm": 41.75,
"learning_rate": 1.4819843342036556e-06,
"loss": 2.0427,
"step": 1420
},
{
"epoch": 0.027073585158411025,
"grad_norm": 89.5,
"learning_rate": 1.5028720626631854e-06,
"loss": 1.9948,
"step": 1440
},
{
"epoch": 0.027449607174500068,
"grad_norm": 123.5,
"learning_rate": 1.5237597911227157e-06,
"loss": 1.987,
"step": 1460
},
{
"epoch": 0.027825629190589108,
"grad_norm": 72.0,
"learning_rate": 1.5446475195822455e-06,
"loss": 2.011,
"step": 1480
},
{
"epoch": 0.02820165120667815,
"grad_norm": 58.5,
"learning_rate": 1.5655352480417757e-06,
"loss": 1.9926,
"step": 1500
},
{
"epoch": 0.02857767322276719,
"grad_norm": 50.5,
"learning_rate": 1.5864229765013055e-06,
"loss": 1.9495,
"step": 1520
},
{
"epoch": 0.028953695238856235,
"grad_norm": 58.5,
"learning_rate": 1.6073107049608356e-06,
"loss": 1.9988,
"step": 1540
},
{
"epoch": 0.02932971725494528,
"grad_norm": 76.0,
"learning_rate": 1.6281984334203658e-06,
"loss": 1.98,
"step": 1560
},
{
"epoch": 0.02970573927103432,
"grad_norm": 99.0,
"learning_rate": 1.6490861618798956e-06,
"loss": 1.9849,
"step": 1580
},
{
"epoch": 0.030081761287123362,
"grad_norm": 116.0,
"learning_rate": 1.6699738903394258e-06,
"loss": 1.9464,
"step": 1600
},
{
"epoch": 0.030457783303212402,
"grad_norm": 119.0,
"learning_rate": 1.6908616187989557e-06,
"loss": 1.9654,
"step": 1620
},
{
"epoch": 0.030833805319301445,
"grad_norm": 130.0,
"learning_rate": 1.7117493472584859e-06,
"loss": 1.9718,
"step": 1640
},
{
"epoch": 0.03120982733539049,
"grad_norm": 200.0,
"learning_rate": 1.732637075718016e-06,
"loss": 1.9127,
"step": 1660
},
{
"epoch": 0.03158584935147953,
"grad_norm": 228.0,
"learning_rate": 1.7535248041775457e-06,
"loss": 1.9649,
"step": 1680
},
{
"epoch": 0.03196187136756857,
"grad_norm": 159.0,
"learning_rate": 1.774412532637076e-06,
"loss": 1.8952,
"step": 1700
},
{
"epoch": 0.03233789338365761,
"grad_norm": 87.0,
"learning_rate": 1.7953002610966058e-06,
"loss": 1.9328,
"step": 1720
},
{
"epoch": 0.032713915399746656,
"grad_norm": 129.0,
"learning_rate": 1.816187989556136e-06,
"loss": 1.9531,
"step": 1740
},
{
"epoch": 0.0330899374158357,
"grad_norm": 187.0,
"learning_rate": 1.8370757180156658e-06,
"loss": 1.8911,
"step": 1760
},
{
"epoch": 0.03346595943192474,
"grad_norm": 140.0,
"learning_rate": 1.857963446475196e-06,
"loss": 1.9228,
"step": 1780
},
{
"epoch": 0.03384198144801378,
"grad_norm": 152.0,
"learning_rate": 1.878851174934726e-06,
"loss": 1.9264,
"step": 1800
},
{
"epoch": 0.03421800346410282,
"grad_norm": 112.0,
"learning_rate": 1.899738903394256e-06,
"loss": 1.957,
"step": 1820
},
{
"epoch": 0.034594025480191866,
"grad_norm": 81.5,
"learning_rate": 1.920626631853786e-06,
"loss": 1.9084,
"step": 1840
},
{
"epoch": 0.03497004749628091,
"grad_norm": 52.25,
"learning_rate": 1.941514360313316e-06,
"loss": 1.895,
"step": 1860
},
{
"epoch": 0.035346069512369946,
"grad_norm": 52.75,
"learning_rate": 1.9624020887728464e-06,
"loss": 1.8667,
"step": 1880
},
{
"epoch": 0.03572209152845899,
"grad_norm": 73.0,
"learning_rate": 1.9832898172323762e-06,
"loss": 1.8782,
"step": 1900
},
{
"epoch": 0.03609811354454803,
"grad_norm": 157.0,
"learning_rate": 2.004177545691906e-06,
"loss": 1.8986,
"step": 1920
},
{
"epoch": 0.036474135560637076,
"grad_norm": 124.5,
"learning_rate": 2.0250652741514363e-06,
"loss": 1.8863,
"step": 1940
},
{
"epoch": 0.03685015757672612,
"grad_norm": 211.0,
"learning_rate": 2.045953002610966e-06,
"loss": 1.8851,
"step": 1960
},
{
"epoch": 0.037226179592815156,
"grad_norm": 157.0,
"learning_rate": 2.0668407310704963e-06,
"loss": 1.8669,
"step": 1980
},
{
"epoch": 0.0376022016089042,
"grad_norm": 163.0,
"learning_rate": 2.087728459530026e-06,
"loss": 1.873,
"step": 2000
},
{
"epoch": 0.03797822362499324,
"grad_norm": 76.5,
"learning_rate": 2.1086161879895564e-06,
"loss": 1.8493,
"step": 2020
},
{
"epoch": 0.03835424564108229,
"grad_norm": 250.0,
"learning_rate": 2.129503916449086e-06,
"loss": 1.8835,
"step": 2040
},
{
"epoch": 0.03873026765717133,
"grad_norm": 86.0,
"learning_rate": 2.1503916449086164e-06,
"loss": 1.8291,
"step": 2060
},
{
"epoch": 0.03910628967326037,
"grad_norm": 81.5,
"learning_rate": 2.1712793733681462e-06,
"loss": 1.8068,
"step": 2080
},
{
"epoch": 0.03948231168934941,
"grad_norm": 94.5,
"learning_rate": 2.1921671018276765e-06,
"loss": 1.7797,
"step": 2100
},
{
"epoch": 0.03985833370543845,
"grad_norm": 153.0,
"learning_rate": 2.2130548302872067e-06,
"loss": 1.8606,
"step": 2120
},
{
"epoch": 0.0402343557215275,
"grad_norm": 160.0,
"learning_rate": 2.2339425587467365e-06,
"loss": 1.8179,
"step": 2140
},
{
"epoch": 0.04061037773761654,
"grad_norm": 252.0,
"learning_rate": 2.2548302872062668e-06,
"loss": 1.8003,
"step": 2160
},
{
"epoch": 0.04098639975370558,
"grad_norm": 272.0,
"learning_rate": 2.2757180156657966e-06,
"loss": 1.7933,
"step": 2180
},
{
"epoch": 0.04136242176979462,
"grad_norm": 66.5,
"learning_rate": 2.2966057441253264e-06,
"loss": 1.8021,
"step": 2200
},
{
"epoch": 0.041738443785883664,
"grad_norm": 132.0,
"learning_rate": 2.3174934725848566e-06,
"loss": 1.757,
"step": 2220
},
{
"epoch": 0.04211446580197271,
"grad_norm": 101.0,
"learning_rate": 2.3383812010443865e-06,
"loss": 1.7466,
"step": 2240
},
{
"epoch": 0.04249048781806175,
"grad_norm": 111.5,
"learning_rate": 2.3592689295039167e-06,
"loss": 1.7771,
"step": 2260
},
{
"epoch": 0.04286650983415079,
"grad_norm": 64.0,
"learning_rate": 2.3801566579634465e-06,
"loss": 1.754,
"step": 2280
},
{
"epoch": 0.04324253185023983,
"grad_norm": 220.0,
"learning_rate": 2.4010443864229767e-06,
"loss": 1.7484,
"step": 2300
},
{
"epoch": 0.043618553866328874,
"grad_norm": 97.5,
"learning_rate": 2.4219321148825066e-06,
"loss": 1.7204,
"step": 2320
},
{
"epoch": 0.04399457588241792,
"grad_norm": 110.5,
"learning_rate": 2.442819843342037e-06,
"loss": 1.7732,
"step": 2340
},
{
"epoch": 0.04437059789850696,
"grad_norm": 73.5,
"learning_rate": 2.463707571801567e-06,
"loss": 1.7447,
"step": 2360
},
{
"epoch": 0.044746619914596,
"grad_norm": 78.5,
"learning_rate": 2.484595300261097e-06,
"loss": 1.7127,
"step": 2380
},
{
"epoch": 0.04512264193068504,
"grad_norm": 63.25,
"learning_rate": 2.5054830287206267e-06,
"loss": 1.6951,
"step": 2400
},
{
"epoch": 0.045498663946774084,
"grad_norm": 56.25,
"learning_rate": 2.5263707571801573e-06,
"loss": 1.6848,
"step": 2420
},
{
"epoch": 0.04587468596286313,
"grad_norm": 69.0,
"learning_rate": 2.547258485639687e-06,
"loss": 1.7051,
"step": 2440
},
{
"epoch": 0.046250707978952164,
"grad_norm": 83.0,
"learning_rate": 2.568146214099217e-06,
"loss": 1.6354,
"step": 2460
},
{
"epoch": 0.04662672999504121,
"grad_norm": 90.0,
"learning_rate": 2.5890339425587468e-06,
"loss": 1.643,
"step": 2480
},
{
"epoch": 0.04700275201113025,
"grad_norm": 62.5,
"learning_rate": 2.6099216710182766e-06,
"loss": 1.6811,
"step": 2500
},
{
"epoch": 0.047378774027219295,
"grad_norm": 199.0,
"learning_rate": 2.6308093994778072e-06,
"loss": 1.6851,
"step": 2520
},
{
"epoch": 0.04775479604330834,
"grad_norm": 57.75,
"learning_rate": 2.651697127937337e-06,
"loss": 1.6055,
"step": 2540
},
{
"epoch": 0.048130818059397375,
"grad_norm": 196.0,
"learning_rate": 2.672584856396867e-06,
"loss": 1.6079,
"step": 2560
},
{
"epoch": 0.04850684007548642,
"grad_norm": 149.0,
"learning_rate": 2.693472584856397e-06,
"loss": 1.6273,
"step": 2580
},
{
"epoch": 0.04888286209157546,
"grad_norm": 95.5,
"learning_rate": 2.714360313315927e-06,
"loss": 1.6333,
"step": 2600
},
{
"epoch": 0.049258884107664505,
"grad_norm": 72.0,
"learning_rate": 2.735248041775457e-06,
"loss": 1.6026,
"step": 2620
},
{
"epoch": 0.04963490612375355,
"grad_norm": 338.0,
"learning_rate": 2.7561357702349874e-06,
"loss": 1.5909,
"step": 2640
},
{
"epoch": 0.050010928139842585,
"grad_norm": 65.5,
"learning_rate": 2.777023498694517e-06,
"loss": 1.6058,
"step": 2660
},
{
"epoch": 0.05038695015593163,
"grad_norm": 126.0,
"learning_rate": 2.797911227154047e-06,
"loss": 1.5821,
"step": 2680
},
{
"epoch": 0.05076297217202067,
"grad_norm": 142.0,
"learning_rate": 2.8187989556135777e-06,
"loss": 1.5928,
"step": 2700
},
{
"epoch": 0.051138994188109715,
"grad_norm": 59.0,
"learning_rate": 2.8396866840731075e-06,
"loss": 1.5513,
"step": 2720
},
{
"epoch": 0.05151501620419876,
"grad_norm": 173.0,
"learning_rate": 2.8605744125326373e-06,
"loss": 1.519,
"step": 2740
},
{
"epoch": 0.051891038220287795,
"grad_norm": 118.5,
"learning_rate": 2.881462140992167e-06,
"loss": 1.5389,
"step": 2760
},
{
"epoch": 0.05226706023637684,
"grad_norm": 121.5,
"learning_rate": 2.9023498694516974e-06,
"loss": 1.5027,
"step": 2780
},
{
"epoch": 0.05264308225246588,
"grad_norm": 71.5,
"learning_rate": 2.9232375979112276e-06,
"loss": 1.5588,
"step": 2800
},
{
"epoch": 0.053019104268554926,
"grad_norm": 148.0,
"learning_rate": 2.9441253263707574e-06,
"loss": 1.544,
"step": 2820
},
{
"epoch": 0.05339512628464397,
"grad_norm": 98.5,
"learning_rate": 2.9650130548302876e-06,
"loss": 1.4796,
"step": 2840
},
{
"epoch": 0.053771148300733006,
"grad_norm": 119.5,
"learning_rate": 2.9859007832898175e-06,
"loss": 1.5498,
"step": 2860
},
{
"epoch": 0.05414717031682205,
"grad_norm": 68.0,
"learning_rate": 3.0067885117493473e-06,
"loss": 1.5174,
"step": 2880
},
{
"epoch": 0.05452319233291109,
"grad_norm": 81.0,
"learning_rate": 3.027676240208878e-06,
"loss": 1.5218,
"step": 2900
},
{
"epoch": 0.054899214349000136,
"grad_norm": 89.5,
"learning_rate": 3.0485639686684078e-06,
"loss": 1.4837,
"step": 2920
},
{
"epoch": 0.05527523636508917,
"grad_norm": 175.0,
"learning_rate": 3.0694516971279376e-06,
"loss": 1.469,
"step": 2940
},
{
"epoch": 0.055651258381178216,
"grad_norm": 188.0,
"learning_rate": 3.0903394255874674e-06,
"loss": 1.4704,
"step": 2960
},
{
"epoch": 0.05602728039726726,
"grad_norm": 53.5,
"learning_rate": 3.111227154046997e-06,
"loss": 1.4528,
"step": 2980
},
{
"epoch": 0.0564033024133563,
"grad_norm": 91.0,
"learning_rate": 3.132114882506528e-06,
"loss": 1.4783,
"step": 3000
},
{
"epoch": 0.056779324429445346,
"grad_norm": 81.5,
"learning_rate": 3.1530026109660577e-06,
"loss": 1.4367,
"step": 3020
},
{
"epoch": 0.05715534644553438,
"grad_norm": 69.0,
"learning_rate": 3.1738903394255875e-06,
"loss": 1.4717,
"step": 3040
},
{
"epoch": 0.057531368461623426,
"grad_norm": 207.0,
"learning_rate": 3.1947780678851177e-06,
"loss": 1.4713,
"step": 3060
},
{
"epoch": 0.05790739047771247,
"grad_norm": 87.5,
"learning_rate": 3.215665796344648e-06,
"loss": 1.4269,
"step": 3080
},
{
"epoch": 0.05828341249380151,
"grad_norm": 87.0,
"learning_rate": 3.2365535248041778e-06,
"loss": 1.4116,
"step": 3100
},
{
"epoch": 0.05865943450989056,
"grad_norm": 133.0,
"learning_rate": 3.257441253263708e-06,
"loss": 1.423,
"step": 3120
},
{
"epoch": 0.05903545652597959,
"grad_norm": 66.0,
"learning_rate": 3.278328981723238e-06,
"loss": 1.3921,
"step": 3140
},
{
"epoch": 0.05941147854206864,
"grad_norm": 70.0,
"learning_rate": 3.2992167101827676e-06,
"loss": 1.4027,
"step": 3160
},
{
"epoch": 0.05978750055815768,
"grad_norm": 114.5,
"learning_rate": 3.3201044386422983e-06,
"loss": 1.4118,
"step": 3180
},
{
"epoch": 0.060163522574246724,
"grad_norm": 64.5,
"learning_rate": 3.340992167101828e-06,
"loss": 1.3805,
"step": 3200
},
{
"epoch": 0.06053954459033577,
"grad_norm": 76.0,
"learning_rate": 3.361879895561358e-06,
"loss": 1.3966,
"step": 3220
},
{
"epoch": 0.060915566606424804,
"grad_norm": 52.5,
"learning_rate": 3.3827676240208877e-06,
"loss": 1.3991,
"step": 3240
},
{
"epoch": 0.06129158862251385,
"grad_norm": 139.0,
"learning_rate": 3.403655352480418e-06,
"loss": 1.3851,
"step": 3260
},
{
"epoch": 0.06166761063860289,
"grad_norm": 56.25,
"learning_rate": 3.4245430809399482e-06,
"loss": 1.3506,
"step": 3280
},
{
"epoch": 0.062043632654691934,
"grad_norm": 86.0,
"learning_rate": 3.445430809399478e-06,
"loss": 1.3288,
"step": 3300
},
{
"epoch": 0.06241965467078098,
"grad_norm": 108.5,
"learning_rate": 3.4663185378590083e-06,
"loss": 1.3767,
"step": 3320
},
{
"epoch": 0.06279567668687001,
"grad_norm": 46.5,
"learning_rate": 3.487206266318538e-06,
"loss": 1.339,
"step": 3340
},
{
"epoch": 0.06317169870295906,
"grad_norm": 127.5,
"learning_rate": 3.5080939947780683e-06,
"loss": 1.3316,
"step": 3360
},
{
"epoch": 0.0635477207190481,
"grad_norm": 53.0,
"learning_rate": 3.5289817232375986e-06,
"loss": 1.362,
"step": 3380
},
{
"epoch": 0.06392374273513714,
"grad_norm": 75.5,
"learning_rate": 3.5498694516971284e-06,
"loss": 1.3073,
"step": 3400
},
{
"epoch": 0.06429976475122619,
"grad_norm": 103.5,
"learning_rate": 3.570757180156658e-06,
"loss": 1.3008,
"step": 3420
},
{
"epoch": 0.06467578676731522,
"grad_norm": 50.25,
"learning_rate": 3.591644908616188e-06,
"loss": 1.3438,
"step": 3440
},
{
"epoch": 0.06505180878340427,
"grad_norm": 108.0,
"learning_rate": 3.6125326370757187e-06,
"loss": 1.3175,
"step": 3460
},
{
"epoch": 0.06542783079949331,
"grad_norm": 97.0,
"learning_rate": 3.6334203655352485e-06,
"loss": 1.3031,
"step": 3480
},
{
"epoch": 0.06580385281558235,
"grad_norm": 124.5,
"learning_rate": 3.6543080939947783e-06,
"loss": 1.3161,
"step": 3500
},
{
"epoch": 0.0661798748316714,
"grad_norm": 78.5,
"learning_rate": 3.675195822454308e-06,
"loss": 1.2822,
"step": 3520
},
{
"epoch": 0.06655589684776043,
"grad_norm": 81.5,
"learning_rate": 3.6960835509138383e-06,
"loss": 1.3111,
"step": 3540
},
{
"epoch": 0.06693191886384948,
"grad_norm": 94.5,
"learning_rate": 3.7169712793733686e-06,
"loss": 1.2909,
"step": 3560
},
{
"epoch": 0.06730794087993852,
"grad_norm": 86.0,
"learning_rate": 3.7378590078328984e-06,
"loss": 1.2535,
"step": 3580
},
{
"epoch": 0.06768396289602756,
"grad_norm": 142.0,
"learning_rate": 3.7587467362924286e-06,
"loss": 1.2963,
"step": 3600
},
{
"epoch": 0.06805998491211661,
"grad_norm": 58.25,
"learning_rate": 3.7796344647519584e-06,
"loss": 1.2354,
"step": 3620
},
{
"epoch": 0.06843600692820564,
"grad_norm": 67.5,
"learning_rate": 3.8005221932114883e-06,
"loss": 1.2719,
"step": 3640
},
{
"epoch": 0.0688120289442947,
"grad_norm": 103.0,
"learning_rate": 3.821409921671019e-06,
"loss": 1.246,
"step": 3660
},
{
"epoch": 0.06918805096038373,
"grad_norm": 110.0,
"learning_rate": 3.842297650130548e-06,
"loss": 1.2397,
"step": 3680
},
{
"epoch": 0.06956407297647277,
"grad_norm": 45.5,
"learning_rate": 3.8631853785900785e-06,
"loss": 1.2576,
"step": 3700
},
{
"epoch": 0.06994009499256182,
"grad_norm": 62.0,
"learning_rate": 3.884073107049609e-06,
"loss": 1.2273,
"step": 3720
},
{
"epoch": 0.07031611700865086,
"grad_norm": 77.5,
"learning_rate": 3.904960835509139e-06,
"loss": 1.2366,
"step": 3740
},
{
"epoch": 0.07069213902473989,
"grad_norm": 89.5,
"learning_rate": 3.925848563968669e-06,
"loss": 1.2027,
"step": 3760
},
{
"epoch": 0.07106816104082894,
"grad_norm": 84.0,
"learning_rate": 3.946736292428199e-06,
"loss": 1.2029,
"step": 3780
},
{
"epoch": 0.07144418305691798,
"grad_norm": 51.0,
"learning_rate": 3.967624020887729e-06,
"loss": 1.2181,
"step": 3800
},
{
"epoch": 0.07182020507300703,
"grad_norm": 71.0,
"learning_rate": 3.988511749347258e-06,
"loss": 1.2405,
"step": 3820
},
{
"epoch": 0.07219622708909607,
"grad_norm": 78.0,
"learning_rate": 4.009399477806789e-06,
"loss": 1.1956,
"step": 3840
},
{
"epoch": 0.0725722491051851,
"grad_norm": 76.0,
"learning_rate": 4.030287206266319e-06,
"loss": 1.2008,
"step": 3860
},
{
"epoch": 0.07294827112127415,
"grad_norm": 71.0,
"learning_rate": 4.051174934725849e-06,
"loss": 1.209,
"step": 3880
},
{
"epoch": 0.07332429313736319,
"grad_norm": 91.0,
"learning_rate": 4.072062663185378e-06,
"loss": 1.2152,
"step": 3900
},
{
"epoch": 0.07370031515345224,
"grad_norm": 65.5,
"learning_rate": 4.092950391644909e-06,
"loss": 1.2116,
"step": 3920
},
{
"epoch": 0.07407633716954128,
"grad_norm": 84.0,
"learning_rate": 4.113838120104439e-06,
"loss": 1.1892,
"step": 3940
},
{
"epoch": 0.07445235918563031,
"grad_norm": 80.0,
"learning_rate": 4.134725848563969e-06,
"loss": 1.1591,
"step": 3960
},
{
"epoch": 0.07482838120171936,
"grad_norm": 93.0,
"learning_rate": 4.155613577023499e-06,
"loss": 1.1767,
"step": 3980
},
{
"epoch": 0.0752044032178084,
"grad_norm": 83.0,
"learning_rate": 4.176501305483029e-06,
"loss": 1.1395,
"step": 4000
},
{
"epoch": 0.07558042523389745,
"grad_norm": 72.5,
"learning_rate": 4.197389033942559e-06,
"loss": 1.1689,
"step": 4020
},
{
"epoch": 0.07595644724998649,
"grad_norm": 58.5,
"learning_rate": 4.218276762402089e-06,
"loss": 1.1351,
"step": 4040
},
{
"epoch": 0.07633246926607552,
"grad_norm": 64.5,
"learning_rate": 4.2391644908616194e-06,
"loss": 1.1314,
"step": 4060
},
{
"epoch": 0.07670849128216457,
"grad_norm": 66.5,
"learning_rate": 4.260052219321149e-06,
"loss": 1.124,
"step": 4080
},
{
"epoch": 0.07708451329825361,
"grad_norm": 42.75,
"learning_rate": 4.280939947780679e-06,
"loss": 1.0991,
"step": 4100
},
{
"epoch": 0.07746053531434266,
"grad_norm": 63.25,
"learning_rate": 4.301827676240209e-06,
"loss": 1.1353,
"step": 4120
},
{
"epoch": 0.0778365573304317,
"grad_norm": 47.5,
"learning_rate": 4.3227154046997395e-06,
"loss": 1.1189,
"step": 4140
},
{
"epoch": 0.07821257934652073,
"grad_norm": 34.0,
"learning_rate": 4.343603133159269e-06,
"loss": 1.1196,
"step": 4160
},
{
"epoch": 0.07858860136260978,
"grad_norm": 140.0,
"learning_rate": 4.364490861618799e-06,
"loss": 1.0964,
"step": 4180
},
{
"epoch": 0.07896462337869882,
"grad_norm": 60.75,
"learning_rate": 4.385378590078329e-06,
"loss": 1.1315,
"step": 4200
},
{
"epoch": 0.07934064539478787,
"grad_norm": 57.75,
"learning_rate": 4.40626631853786e-06,
"loss": 1.0911,
"step": 4220
},
{
"epoch": 0.0797166674108769,
"grad_norm": 70.5,
"learning_rate": 4.42715404699739e-06,
"loss": 1.084,
"step": 4240
},
{
"epoch": 0.08009268942696594,
"grad_norm": 29.25,
"learning_rate": 4.448041775456919e-06,
"loss": 1.0646,
"step": 4260
},
{
"epoch": 0.080468711443055,
"grad_norm": 33.0,
"learning_rate": 4.4689295039164495e-06,
"loss": 1.0657,
"step": 4280
},
{
"epoch": 0.08084473345914403,
"grad_norm": 75.5,
"learning_rate": 4.489817232375979e-06,
"loss": 1.0553,
"step": 4300
},
{
"epoch": 0.08122075547523308,
"grad_norm": 49.75,
"learning_rate": 4.51070496083551e-06,
"loss": 1.0675,
"step": 4320
},
{
"epoch": 0.08159677749132212,
"grad_norm": 40.25,
"learning_rate": 4.531592689295039e-06,
"loss": 1.0539,
"step": 4340
},
{
"epoch": 0.08197279950741115,
"grad_norm": 40.5,
"learning_rate": 4.55248041775457e-06,
"loss": 1.0381,
"step": 4360
},
{
"epoch": 0.0823488215235002,
"grad_norm": 51.75,
"learning_rate": 4.573368146214099e-06,
"loss": 1.062,
"step": 4380
},
{
"epoch": 0.08272484353958924,
"grad_norm": 120.5,
"learning_rate": 4.59425587467363e-06,
"loss": 1.0553,
"step": 4400
},
{
"epoch": 0.08310086555567829,
"grad_norm": 51.75,
"learning_rate": 4.6151436031331595e-06,
"loss": 1.0386,
"step": 4420
},
{
"epoch": 0.08347688757176733,
"grad_norm": 29.125,
"learning_rate": 4.63603133159269e-06,
"loss": 1.0334,
"step": 4440
},
{
"epoch": 0.08385290958785636,
"grad_norm": 43.25,
"learning_rate": 4.65691906005222e-06,
"loss": 1.0358,
"step": 4460
},
{
"epoch": 0.08422893160394541,
"grad_norm": 34.5,
"learning_rate": 4.677806788511749e-06,
"loss": 1.0064,
"step": 4480
},
{
"epoch": 0.08460495362003445,
"grad_norm": 40.25,
"learning_rate": 4.6986945169712796e-06,
"loss": 1.0205,
"step": 4500
},
{
"epoch": 0.0849809756361235,
"grad_norm": 37.75,
"learning_rate": 4.71958224543081e-06,
"loss": 0.9937,
"step": 4520
},
{
"epoch": 0.08535699765221254,
"grad_norm": 41.25,
"learning_rate": 4.74046997389034e-06,
"loss": 1.0081,
"step": 4540
},
{
"epoch": 0.08573301966830157,
"grad_norm": 59.25,
"learning_rate": 4.7613577023498694e-06,
"loss": 0.9894,
"step": 4560
},
{
"epoch": 0.08610904168439062,
"grad_norm": 57.5,
"learning_rate": 4.7822454308094e-06,
"loss": 0.9949,
"step": 4580
},
{
"epoch": 0.08648506370047966,
"grad_norm": 46.5,
"learning_rate": 4.80313315926893e-06,
"loss": 1.0066,
"step": 4600
},
{
"epoch": 0.08686108571656871,
"grad_norm": 46.0,
"learning_rate": 4.82402088772846e-06,
"loss": 1.001,
"step": 4620
},
{
"epoch": 0.08723710773265775,
"grad_norm": 34.5,
"learning_rate": 4.8449086161879895e-06,
"loss": 0.9981,
"step": 4640
},
{
"epoch": 0.08761312974874678,
"grad_norm": 49.75,
"learning_rate": 4.86579634464752e-06,
"loss": 0.9975,
"step": 4660
},
{
"epoch": 0.08798915176483584,
"grad_norm": 25.125,
"learning_rate": 4.88668407310705e-06,
"loss": 0.9697,
"step": 4680
},
{
"epoch": 0.08836517378092487,
"grad_norm": 68.0,
"learning_rate": 4.90757180156658e-06,
"loss": 0.9851,
"step": 4700
},
{
"epoch": 0.08874119579701392,
"grad_norm": 122.0,
"learning_rate": 4.9284595300261105e-06,
"loss": 0.9624,
"step": 4720
},
{
"epoch": 0.08911721781310296,
"grad_norm": 68.5,
"learning_rate": 4.94934725848564e-06,
"loss": 0.9812,
"step": 4740
},
{
"epoch": 0.089493239829192,
"grad_norm": 43.5,
"learning_rate": 4.97023498694517e-06,
"loss": 0.9594,
"step": 4760
},
{
"epoch": 0.08986926184528105,
"grad_norm": 26.625,
"learning_rate": 4.9911227154047e-06,
"loss": 0.9582,
"step": 4780
},
{
"epoch": 0.09024528386137008,
"grad_norm": 30.375,
"learning_rate": 5.012010443864231e-06,
"loss": 0.9374,
"step": 4800
},
{
"epoch": 0.09062130587745912,
"grad_norm": 43.75,
"learning_rate": 5.03289817232376e-06,
"loss": 0.9564,
"step": 4820
},
{
"epoch": 0.09099732789354817,
"grad_norm": 61.5,
"learning_rate": 5.05378590078329e-06,
"loss": 0.9293,
"step": 4840
},
{
"epoch": 0.0913733499096372,
"grad_norm": 34.25,
"learning_rate": 5.07467362924282e-06,
"loss": 0.9345,
"step": 4860
},
{
"epoch": 0.09174937192572626,
"grad_norm": 27.375,
"learning_rate": 5.09556135770235e-06,
"loss": 0.9374,
"step": 4880
},
{
"epoch": 0.09212539394181529,
"grad_norm": 42.0,
"learning_rate": 5.11644908616188e-06,
"loss": 0.9305,
"step": 4900
},
{
"epoch": 0.09250141595790433,
"grad_norm": 27.875,
"learning_rate": 5.137336814621411e-06,
"loss": 0.9216,
"step": 4920
},
{
"epoch": 0.09287743797399338,
"grad_norm": 54.0,
"learning_rate": 5.1582245430809406e-06,
"loss": 0.9187,
"step": 4940
},
{
"epoch": 0.09325345999008242,
"grad_norm": 33.5,
"learning_rate": 5.179112271540471e-06,
"loss": 0.9057,
"step": 4960
},
{
"epoch": 0.09362948200617147,
"grad_norm": 34.5,
"learning_rate": 5.2e-06,
"loss": 0.8961,
"step": 4980
},
{
"epoch": 0.0940055040222605,
"grad_norm": 33.75,
"learning_rate": 5.2208877284595304e-06,
"loss": 0.9232,
"step": 5000
},
{
"epoch": 0.09438152603834954,
"grad_norm": 21.375,
"learning_rate": 5.241775456919061e-06,
"loss": 0.9002,
"step": 5020
},
{
"epoch": 0.09475754805443859,
"grad_norm": 37.0,
"learning_rate": 5.26266318537859e-06,
"loss": 0.9141,
"step": 5040
},
{
"epoch": 0.09513357007052763,
"grad_norm": 31.75,
"learning_rate": 5.28355091383812e-06,
"loss": 0.9062,
"step": 5060
},
{
"epoch": 0.09550959208661668,
"grad_norm": 25.0,
"learning_rate": 5.3044386422976505e-06,
"loss": 0.8911,
"step": 5080
},
{
"epoch": 0.09588561410270571,
"grad_norm": 39.75,
"learning_rate": 5.32532637075718e-06,
"loss": 0.8961,
"step": 5100
},
{
"epoch": 0.09626163611879475,
"grad_norm": 21.375,
"learning_rate": 5.346214099216711e-06,
"loss": 0.8682,
"step": 5120
},
{
"epoch": 0.0966376581348838,
"grad_norm": 23.625,
"learning_rate": 5.367101827676241e-06,
"loss": 0.8722,
"step": 5140
},
{
"epoch": 0.09701368015097284,
"grad_norm": 32.75,
"learning_rate": 5.387989556135771e-06,
"loss": 0.8664,
"step": 5160
},
{
"epoch": 0.09738970216706189,
"grad_norm": 22.875,
"learning_rate": 5.408877284595301e-06,
"loss": 0.8557,
"step": 5180
},
{
"epoch": 0.09776572418315092,
"grad_norm": 43.5,
"learning_rate": 5.429765013054831e-06,
"loss": 0.8546,
"step": 5200
},
{
"epoch": 0.09814174619923996,
"grad_norm": 22.375,
"learning_rate": 5.4506527415143605e-06,
"loss": 0.8568,
"step": 5220
},
{
"epoch": 0.09851776821532901,
"grad_norm": 24.75,
"learning_rate": 5.471540469973891e-06,
"loss": 0.8628,
"step": 5240
},
{
"epoch": 0.09889379023141805,
"grad_norm": 23.75,
"learning_rate": 5.49242819843342e-06,
"loss": 0.8456,
"step": 5260
},
{
"epoch": 0.0992698122475071,
"grad_norm": 23.5,
"learning_rate": 5.51331592689295e-06,
"loss": 0.8357,
"step": 5280
},
{
"epoch": 0.09964583426359613,
"grad_norm": 14.9375,
"learning_rate": 5.5342036553524814e-06,
"loss": 0.8189,
"step": 5300
},
{
"epoch": 0.10002185627968517,
"grad_norm": 40.25,
"learning_rate": 5.555091383812012e-06,
"loss": 0.8384,
"step": 5320
},
{
"epoch": 0.10039787829577422,
"grad_norm": 48.0,
"learning_rate": 5.575979112271541e-06,
"loss": 0.8441,
"step": 5340
},
{
"epoch": 0.10077390031186326,
"grad_norm": 25.0,
"learning_rate": 5.596866840731071e-06,
"loss": 0.81,
"step": 5360
},
{
"epoch": 0.10114992232795231,
"grad_norm": 53.25,
"learning_rate": 5.617754569190601e-06,
"loss": 0.846,
"step": 5380
},
{
"epoch": 0.10152594434404134,
"grad_norm": 21.25,
"learning_rate": 5.638642297650131e-06,
"loss": 0.8235,
"step": 5400
},
{
"epoch": 0.10190196636013038,
"grad_norm": 22.75,
"learning_rate": 5.659530026109661e-06,
"loss": 0.8416,
"step": 5420
},
{
"epoch": 0.10227798837621943,
"grad_norm": 10.4375,
"learning_rate": 5.6804177545691906e-06,
"loss": 0.8025,
"step": 5440
},
{
"epoch": 0.10265401039230847,
"grad_norm": 11.3125,
"learning_rate": 5.701305483028721e-06,
"loss": 0.8071,
"step": 5460
},
{
"epoch": 0.10303003240839752,
"grad_norm": 13.0,
"learning_rate": 5.72219321148825e-06,
"loss": 0.819,
"step": 5480
},
{
"epoch": 0.10340605442448655,
"grad_norm": 36.25,
"learning_rate": 5.743080939947781e-06,
"loss": 0.809,
"step": 5500
},
{
"epoch": 0.10378207644057559,
"grad_norm": 22.0,
"learning_rate": 5.7639686684073115e-06,
"loss": 0.8222,
"step": 5520
},
{
"epoch": 0.10415809845666464,
"grad_norm": 20.375,
"learning_rate": 5.784856396866842e-06,
"loss": 0.7892,
"step": 5540
},
{
"epoch": 0.10453412047275368,
"grad_norm": 23.875,
"learning_rate": 5.805744125326371e-06,
"loss": 0.8109,
"step": 5560
},
{
"epoch": 0.10491014248884273,
"grad_norm": 43.5,
"learning_rate": 5.826631853785901e-06,
"loss": 0.7978,
"step": 5580
},
{
"epoch": 0.10528616450493176,
"grad_norm": 22.25,
"learning_rate": 5.847519582245431e-06,
"loss": 0.7947,
"step": 5600
},
{
"epoch": 0.1056621865210208,
"grad_norm": 9.5,
"learning_rate": 5.868407310704961e-06,
"loss": 0.8045,
"step": 5620
},
{
"epoch": 0.10603820853710985,
"grad_norm": 12.375,
"learning_rate": 5.889295039164491e-06,
"loss": 0.8083,
"step": 5640
},
{
"epoch": 0.10641423055319889,
"grad_norm": 30.125,
"learning_rate": 5.910182767624021e-06,
"loss": 0.8052,
"step": 5660
},
{
"epoch": 0.10679025256928794,
"grad_norm": 20.125,
"learning_rate": 5.931070496083552e-06,
"loss": 0.7854,
"step": 5680
},
{
"epoch": 0.10716627458537697,
"grad_norm": 15.25,
"learning_rate": 5.951958224543082e-06,
"loss": 0.7947,
"step": 5700
},
{
"epoch": 0.10754229660146601,
"grad_norm": 8.5625,
"learning_rate": 5.972845953002611e-06,
"loss": 0.7992,
"step": 5720
},
{
"epoch": 0.10791831861755506,
"grad_norm": 20.625,
"learning_rate": 5.993733681462142e-06,
"loss": 0.7855,
"step": 5740
},
{
"epoch": 0.1082943406336441,
"grad_norm": 10.125,
"learning_rate": 6.014621409921672e-06,
"loss": 0.7857,
"step": 5760
},
{
"epoch": 0.10867036264973315,
"grad_norm": 33.5,
"learning_rate": 6.035509138381201e-06,
"loss": 0.7908,
"step": 5780
},
{
"epoch": 0.10904638466582219,
"grad_norm": 12.75,
"learning_rate": 6.0563968668407315e-06,
"loss": 0.771,
"step": 5800
},
{
"epoch": 0.10942240668191122,
"grad_norm": 23.5,
"learning_rate": 6.077284595300262e-06,
"loss": 0.7734,
"step": 5820
},
{
"epoch": 0.10979842869800027,
"grad_norm": 12.125,
"learning_rate": 6.098172323759791e-06,
"loss": 0.7723,
"step": 5840
},
{
"epoch": 0.11017445071408931,
"grad_norm": 14.0,
"learning_rate": 6.119060052219322e-06,
"loss": 0.7602,
"step": 5860
},
{
"epoch": 0.11055047273017835,
"grad_norm": 32.0,
"learning_rate": 6.139947780678852e-06,
"loss": 0.7809,
"step": 5880
},
{
"epoch": 0.1109264947462674,
"grad_norm": 15.4375,
"learning_rate": 6.160835509138382e-06,
"loss": 0.7905,
"step": 5900
},
{
"epoch": 0.11130251676235643,
"grad_norm": 24.875,
"learning_rate": 6.181723237597912e-06,
"loss": 0.7764,
"step": 5920
},
{
"epoch": 0.11167853877844548,
"grad_norm": 19.625,
"learning_rate": 6.202610966057441e-06,
"loss": 0.7877,
"step": 5940
},
{
"epoch": 0.11205456079453452,
"grad_norm": 13.5625,
"learning_rate": 6.223498694516972e-06,
"loss": 0.7779,
"step": 5960
},
{
"epoch": 0.11243058281062356,
"grad_norm": 15.9375,
"learning_rate": 6.244386422976502e-06,
"loss": 0.7711,
"step": 5980
},
{
"epoch": 0.1128066048267126,
"grad_norm": 7.90625,
"learning_rate": 6.265274151436031e-06,
"loss": 0.7661,
"step": 6000
},
{
"epoch": 0.11318262684280164,
"grad_norm": 7.75,
"learning_rate": 6.2861618798955615e-06,
"loss": 0.76,
"step": 6020
},
{
"epoch": 0.11355864885889069,
"grad_norm": 9.9375,
"learning_rate": 6.307049608355092e-06,
"loss": 0.7445,
"step": 6040
},
{
"epoch": 0.11393467087497973,
"grad_norm": 12.125,
"learning_rate": 6.327937336814622e-06,
"loss": 0.7601,
"step": 6060
},
{
"epoch": 0.11431069289106877,
"grad_norm": 10.5,
"learning_rate": 6.348825065274152e-06,
"loss": 0.7641,
"step": 6080
},
{
"epoch": 0.11468671490715782,
"grad_norm": 26.0,
"learning_rate": 6.3697127937336825e-06,
"loss": 0.7501,
"step": 6100
},
{
"epoch": 0.11506273692324685,
"grad_norm": 18.625,
"learning_rate": 6.390600522193212e-06,
"loss": 0.7625,
"step": 6120
},
{
"epoch": 0.1154387589393359,
"grad_norm": 12.375,
"learning_rate": 6.411488250652742e-06,
"loss": 0.752,
"step": 6140
},
{
"epoch": 0.11581478095542494,
"grad_norm": 7.21875,
"learning_rate": 6.432375979112272e-06,
"loss": 0.7546,
"step": 6160
},
{
"epoch": 0.11619080297151398,
"grad_norm": 13.5625,
"learning_rate": 6.453263707571802e-06,
"loss": 0.7533,
"step": 6180
},
{
"epoch": 0.11656682498760303,
"grad_norm": 7.78125,
"learning_rate": 6.474151436031332e-06,
"loss": 0.7427,
"step": 6200
},
{
"epoch": 0.11694284700369206,
"grad_norm": 7.34375,
"learning_rate": 6.495039164490861e-06,
"loss": 0.7569,
"step": 6220
},
{
"epoch": 0.11731886901978111,
"grad_norm": 11.5,
"learning_rate": 6.5159268929503924e-06,
"loss": 0.7568,
"step": 6240
},
{
"epoch": 0.11769489103587015,
"grad_norm": 9.125,
"learning_rate": 6.536814621409923e-06,
"loss": 0.7583,
"step": 6260
},
{
"epoch": 0.11807091305195919,
"grad_norm": 6.0,
"learning_rate": 6.557702349869453e-06,
"loss": 0.745,
"step": 6280
},
{
"epoch": 0.11844693506804824,
"grad_norm": 8.9375,
"learning_rate": 6.578590078328982e-06,
"loss": 0.7423,
"step": 6300
},
{
"epoch": 0.11882295708413727,
"grad_norm": 17.375,
"learning_rate": 6.5994778067885125e-06,
"loss": 0.7417,
"step": 6320
},
{
"epoch": 0.11919897910022632,
"grad_norm": 6.28125,
"learning_rate": 6.620365535248042e-06,
"loss": 0.7414,
"step": 6340
},
{
"epoch": 0.11957500111631536,
"grad_norm": 18.375,
"learning_rate": 6.641253263707572e-06,
"loss": 0.7489,
"step": 6360
},
{
"epoch": 0.1199510231324044,
"grad_norm": 18.75,
"learning_rate": 6.662140992167102e-06,
"loss": 0.748,
"step": 6380
},
{
"epoch": 0.12032704514849345,
"grad_norm": 7.46875,
"learning_rate": 6.683028720626632e-06,
"loss": 0.7259,
"step": 6400
},
{
"epoch": 0.12070306716458248,
"grad_norm": 6.78125,
"learning_rate": 6.703916449086162e-06,
"loss": 0.7454,
"step": 6420
},
{
"epoch": 0.12107908918067153,
"grad_norm": 12.0,
"learning_rate": 6.724804177545693e-06,
"loss": 0.7378,
"step": 6440
},
{
"epoch": 0.12145511119676057,
"grad_norm": 10.375,
"learning_rate": 6.7456919060052225e-06,
"loss": 0.7508,
"step": 6460
},
{
"epoch": 0.12183113321284961,
"grad_norm": 8.875,
"learning_rate": 6.766579634464753e-06,
"loss": 0.7262,
"step": 6480
},
{
"epoch": 0.12220715522893866,
"grad_norm": 10.375,
"learning_rate": 6.787467362924283e-06,
"loss": 0.7445,
"step": 6500
},
{
"epoch": 0.1225831772450277,
"grad_norm": 7.46875,
"learning_rate": 6.808355091383812e-06,
"loss": 0.7337,
"step": 6520
},
{
"epoch": 0.12295919926111674,
"grad_norm": 20.375,
"learning_rate": 6.829242819843343e-06,
"loss": 0.7305,
"step": 6540
},
{
"epoch": 0.12333522127720578,
"grad_norm": 6.875,
"learning_rate": 6.850130548302872e-06,
"loss": 0.7247,
"step": 6560
},
{
"epoch": 0.12371124329329482,
"grad_norm": 9.1875,
"learning_rate": 6.871018276762402e-06,
"loss": 0.7185,
"step": 6580
},
{
"epoch": 0.12408726530938387,
"grad_norm": 8.125,
"learning_rate": 6.8919060052219325e-06,
"loss": 0.7359,
"step": 6600
},
{
"epoch": 0.1244632873254729,
"grad_norm": 11.3125,
"learning_rate": 6.9127937336814636e-06,
"loss": 0.7158,
"step": 6620
},
{
"epoch": 0.12483930934156195,
"grad_norm": 17.75,
"learning_rate": 6.933681462140993e-06,
"loss": 0.7367,
"step": 6640
},
{
"epoch": 0.12521533135765098,
"grad_norm": 9.375,
"learning_rate": 6.954569190600523e-06,
"loss": 0.718,
"step": 6660
},
{
"epoch": 0.12559135337374003,
"grad_norm": 8.375,
"learning_rate": 6.975456919060053e-06,
"loss": 0.7174,
"step": 6680
},
{
"epoch": 0.12596737538982908,
"grad_norm": 6.90625,
"learning_rate": 6.996344647519583e-06,
"loss": 0.7147,
"step": 6700
},
{
"epoch": 0.12634339740591813,
"grad_norm": 7.40625,
"learning_rate": 7.017232375979113e-06,
"loss": 0.7146,
"step": 6720
},
{
"epoch": 0.12671941942200715,
"grad_norm": 9.0,
"learning_rate": 7.0381201044386425e-06,
"loss": 0.7205,
"step": 6740
},
{
"epoch": 0.1270954414380962,
"grad_norm": 4.9375,
"learning_rate": 7.059007832898173e-06,
"loss": 0.715,
"step": 6760
},
{
"epoch": 0.12747146345418525,
"grad_norm": 7.375,
"learning_rate": 7.079895561357703e-06,
"loss": 0.7167,
"step": 6780
},
{
"epoch": 0.12784748547027427,
"grad_norm": 6.3125,
"learning_rate": 7.100783289817232e-06,
"loss": 0.7125,
"step": 6800
},
{
"epoch": 0.12822350748636333,
"grad_norm": 6.5625,
"learning_rate": 7.121671018276763e-06,
"loss": 0.7231,
"step": 6820
},
{
"epoch": 0.12859952950245238,
"grad_norm": 7.96875,
"learning_rate": 7.142558746736294e-06,
"loss": 0.7154,
"step": 6840
},
{
"epoch": 0.1289755515185414,
"grad_norm": 6.46875,
"learning_rate": 7.163446475195823e-06,
"loss": 0.7118,
"step": 6860
},
{
"epoch": 0.12935157353463045,
"grad_norm": 12.8125,
"learning_rate": 7.184334203655353e-06,
"loss": 0.6987,
"step": 6880
},
{
"epoch": 0.1297275955507195,
"grad_norm": 6.375,
"learning_rate": 7.205221932114883e-06,
"loss": 0.7044,
"step": 6900
},
{
"epoch": 0.13010361756680855,
"grad_norm": 6.6875,
"learning_rate": 7.226109660574413e-06,
"loss": 0.6975,
"step": 6920
},
{
"epoch": 0.13047963958289757,
"grad_norm": 5.875,
"learning_rate": 7.246997389033943e-06,
"loss": 0.7044,
"step": 6940
},
{
"epoch": 0.13085566159898662,
"grad_norm": 10.8125,
"learning_rate": 7.2678851174934725e-06,
"loss": 0.6952,
"step": 6960
},
{
"epoch": 0.13123168361507567,
"grad_norm": 5.25,
"learning_rate": 7.288772845953003e-06,
"loss": 0.7118,
"step": 6980
},
{
"epoch": 0.1316077056311647,
"grad_norm": 5.28125,
"learning_rate": 7.309660574412534e-06,
"loss": 0.713,
"step": 7000
},
{
"epoch": 0.13198372764725375,
"grad_norm": 8.0625,
"learning_rate": 7.330548302872063e-06,
"loss": 0.7068,
"step": 7020
},
{
"epoch": 0.1323597496633428,
"grad_norm": 6.21875,
"learning_rate": 7.3514360313315935e-06,
"loss": 0.7073,
"step": 7040
},
{
"epoch": 0.13273577167943182,
"grad_norm": 9.9375,
"learning_rate": 7.372323759791124e-06,
"loss": 0.7097,
"step": 7060
},
{
"epoch": 0.13311179369552087,
"grad_norm": 5.09375,
"learning_rate": 7.393211488250653e-06,
"loss": 0.6954,
"step": 7080
},
{
"epoch": 0.13348781571160992,
"grad_norm": 5.0,
"learning_rate": 7.414099216710183e-06,
"loss": 0.7081,
"step": 7100
},
{
"epoch": 0.13386383772769897,
"grad_norm": 4.6875,
"learning_rate": 7.4349869451697136e-06,
"loss": 0.702,
"step": 7120
},
{
"epoch": 0.134239859743788,
"grad_norm": 5.09375,
"learning_rate": 7.455874673629243e-06,
"loss": 0.7005,
"step": 7140
},
{
"epoch": 0.13461588175987704,
"grad_norm": 7.4375,
"learning_rate": 7.476762402088773e-06,
"loss": 0.6976,
"step": 7160
},
{
"epoch": 0.1349919037759661,
"grad_norm": 4.46875,
"learning_rate": 7.497650130548304e-06,
"loss": 0.6871,
"step": 7180
},
{
"epoch": 0.13536792579205512,
"grad_norm": 4.71875,
"learning_rate": 7.518537859007834e-06,
"loss": 0.6885,
"step": 7200
},
{
"epoch": 0.13574394780814417,
"grad_norm": 4.65625,
"learning_rate": 7.539425587467364e-06,
"loss": 0.6964,
"step": 7220
},
{
"epoch": 0.13611996982423322,
"grad_norm": 8.6875,
"learning_rate": 7.560313315926894e-06,
"loss": 0.7011,
"step": 7240
},
{
"epoch": 0.13649599184032224,
"grad_norm": 8.75,
"learning_rate": 7.5812010443864235e-06,
"loss": 0.687,
"step": 7260
},
{
"epoch": 0.1368720138564113,
"grad_norm": 4.03125,
"learning_rate": 7.602088772845954e-06,
"loss": 0.6915,
"step": 7280
},
{
"epoch": 0.13724803587250034,
"grad_norm": 5.84375,
"learning_rate": 7.622976501305483e-06,
"loss": 0.6777,
"step": 7300
},
{
"epoch": 0.1376240578885894,
"grad_norm": 4.78125,
"learning_rate": 7.643864229765013e-06,
"loss": 0.6925,
"step": 7320
},
{
"epoch": 0.1380000799046784,
"grad_norm": 13.3125,
"learning_rate": 7.664751958224544e-06,
"loss": 0.6797,
"step": 7340
},
{
"epoch": 0.13837610192076746,
"grad_norm": 7.5625,
"learning_rate": 7.685639686684074e-06,
"loss": 0.6898,
"step": 7360
},
{
"epoch": 0.13875212393685651,
"grad_norm": 4.34375,
"learning_rate": 7.706527415143604e-06,
"loss": 0.689,
"step": 7380
},
{
"epoch": 0.13912814595294554,
"grad_norm": 4.0,
"learning_rate": 7.727415143603134e-06,
"loss": 0.6946,
"step": 7400
},
{
"epoch": 0.1395041679690346,
"grad_norm": 5.53125,
"learning_rate": 7.748302872062665e-06,
"loss": 0.6817,
"step": 7420
},
{
"epoch": 0.13988018998512364,
"grad_norm": 6.03125,
"learning_rate": 7.769190600522193e-06,
"loss": 0.6864,
"step": 7440
},
{
"epoch": 0.14025621200121266,
"grad_norm": 4.84375,
"learning_rate": 7.790078328981723e-06,
"loss": 0.6869,
"step": 7460
},
{
"epoch": 0.1406322340173017,
"grad_norm": 4.4375,
"learning_rate": 7.810966057441254e-06,
"loss": 0.6908,
"step": 7480
},
{
"epoch": 0.14100825603339076,
"grad_norm": 8.625,
"learning_rate": 7.831853785900784e-06,
"loss": 0.6784,
"step": 7500
},
{
"epoch": 0.14138427804947978,
"grad_norm": 3.25,
"learning_rate": 7.852741514360314e-06,
"loss": 0.6762,
"step": 7520
},
{
"epoch": 0.14176030006556883,
"grad_norm": 8.4375,
"learning_rate": 7.873629242819844e-06,
"loss": 0.6726,
"step": 7540
},
{
"epoch": 0.14213632208165788,
"grad_norm": 4.1875,
"learning_rate": 7.894516971279375e-06,
"loss": 0.6635,
"step": 7560
},
{
"epoch": 0.14251234409774693,
"grad_norm": 5.40625,
"learning_rate": 7.915404699738905e-06,
"loss": 0.6875,
"step": 7580
},
{
"epoch": 0.14288836611383596,
"grad_norm": 4.5625,
"learning_rate": 7.936292428198435e-06,
"loss": 0.6747,
"step": 7600
},
{
"epoch": 0.143264388129925,
"grad_norm": 3.53125,
"learning_rate": 7.957180156657964e-06,
"loss": 0.6749,
"step": 7620
},
{
"epoch": 0.14364041014601406,
"grad_norm": 4.125,
"learning_rate": 7.978067885117494e-06,
"loss": 0.6701,
"step": 7640
},
{
"epoch": 0.14401643216210308,
"grad_norm": 5.59375,
"learning_rate": 7.998955613577024e-06,
"loss": 0.6641,
"step": 7660
},
{
"epoch": 0.14439245417819213,
"grad_norm": 10.75,
"learning_rate": 8.019843342036554e-06,
"loss": 0.661,
"step": 7680
},
{
"epoch": 0.14476847619428118,
"grad_norm": 4.84375,
"learning_rate": 8.040731070496085e-06,
"loss": 0.6687,
"step": 7700
},
{
"epoch": 0.1451444982103702,
"grad_norm": 4.875,
"learning_rate": 8.061618798955613e-06,
"loss": 0.6559,
"step": 7720
},
{
"epoch": 0.14552052022645925,
"grad_norm": 10.6875,
"learning_rate": 8.082506527415143e-06,
"loss": 0.6601,
"step": 7740
},
{
"epoch": 0.1458965422425483,
"grad_norm": 4.46875,
"learning_rate": 8.103394255874675e-06,
"loss": 0.6761,
"step": 7760
},
{
"epoch": 0.14627256425863736,
"grad_norm": 3.609375,
"learning_rate": 8.124281984334205e-06,
"loss": 0.6663,
"step": 7780
},
{
"epoch": 0.14664858627472638,
"grad_norm": 4.0625,
"learning_rate": 8.145169712793734e-06,
"loss": 0.6693,
"step": 7800
},
{
"epoch": 0.14702460829081543,
"grad_norm": 7.25,
"learning_rate": 8.166057441253264e-06,
"loss": 0.6543,
"step": 7820
},
{
"epoch": 0.14740063030690448,
"grad_norm": 4.59375,
"learning_rate": 8.186945169712795e-06,
"loss": 0.6718,
"step": 7840
},
{
"epoch": 0.1477766523229935,
"grad_norm": 4.5,
"learning_rate": 8.207832898172325e-06,
"loss": 0.6534,
"step": 7860
},
{
"epoch": 0.14815267433908255,
"grad_norm": 5.21875,
"learning_rate": 8.228720626631855e-06,
"loss": 0.6576,
"step": 7880
},
{
"epoch": 0.1485286963551716,
"grad_norm": 3.0,
"learning_rate": 8.249608355091384e-06,
"loss": 0.646,
"step": 7900
},
{
"epoch": 0.14890471837126062,
"grad_norm": 4.21875,
"learning_rate": 8.270496083550914e-06,
"loss": 0.6576,
"step": 7920
},
{
"epoch": 0.14928074038734968,
"grad_norm": 4.875,
"learning_rate": 8.291383812010446e-06,
"loss": 0.6728,
"step": 7940
},
{
"epoch": 0.14965676240343873,
"grad_norm": 6.1875,
"learning_rate": 8.312271540469974e-06,
"loss": 0.6676,
"step": 7960
},
{
"epoch": 0.15003278441952778,
"grad_norm": 11.6875,
"learning_rate": 8.333159268929504e-06,
"loss": 0.6537,
"step": 7980
},
{
"epoch": 0.1504088064356168,
"grad_norm": 5.40625,
"learning_rate": 8.354046997389035e-06,
"loss": 0.6551,
"step": 8000
},
{
"epoch": 0.15078482845170585,
"grad_norm": 3.703125,
"learning_rate": 8.374934725848565e-06,
"loss": 0.6531,
"step": 8020
},
{
"epoch": 0.1511608504677949,
"grad_norm": 4.0,
"learning_rate": 8.395822454308095e-06,
"loss": 0.654,
"step": 8040
},
{
"epoch": 0.15153687248388392,
"grad_norm": 7.3125,
"learning_rate": 8.416710182767624e-06,
"loss": 0.658,
"step": 8060
},
{
"epoch": 0.15191289449997297,
"grad_norm": 3.71875,
"learning_rate": 8.437597911227154e-06,
"loss": 0.6516,
"step": 8080
},
{
"epoch": 0.15228891651606202,
"grad_norm": 6.34375,
"learning_rate": 8.458485639686684e-06,
"loss": 0.6584,
"step": 8100
},
{
"epoch": 0.15266493853215105,
"grad_norm": 3.171875,
"learning_rate": 8.479373368146214e-06,
"loss": 0.6544,
"step": 8120
},
{
"epoch": 0.1530409605482401,
"grad_norm": 3.765625,
"learning_rate": 8.500261096605745e-06,
"loss": 0.6579,
"step": 8140
},
{
"epoch": 0.15341698256432915,
"grad_norm": 3.03125,
"learning_rate": 8.521148825065275e-06,
"loss": 0.6413,
"step": 8160
},
{
"epoch": 0.1537930045804182,
"grad_norm": 2.734375,
"learning_rate": 8.542036553524805e-06,
"loss": 0.6537,
"step": 8180
},
{
"epoch": 0.15416902659650722,
"grad_norm": 2.53125,
"learning_rate": 8.562924281984335e-06,
"loss": 0.6568,
"step": 8200
},
{
"epoch": 0.15454504861259627,
"grad_norm": 3.203125,
"learning_rate": 8.583812010443866e-06,
"loss": 0.636,
"step": 8220
},
{
"epoch": 0.15492107062868532,
"grad_norm": 2.296875,
"learning_rate": 8.604699738903394e-06,
"loss": 0.6439,
"step": 8240
},
{
"epoch": 0.15529709264477434,
"grad_norm": 3.296875,
"learning_rate": 8.625587467362924e-06,
"loss": 0.6409,
"step": 8260
},
{
"epoch": 0.1556731146608634,
"grad_norm": 3.015625,
"learning_rate": 8.646475195822455e-06,
"loss": 0.6533,
"step": 8280
},
{
"epoch": 0.15604913667695244,
"grad_norm": 3.3125,
"learning_rate": 8.667362924281985e-06,
"loss": 0.648,
"step": 8300
},
{
"epoch": 0.15642515869304147,
"grad_norm": 2.5625,
"learning_rate": 8.688250652741515e-06,
"loss": 0.6388,
"step": 8320
},
{
"epoch": 0.15680118070913052,
"grad_norm": 5.125,
"learning_rate": 8.709138381201045e-06,
"loss": 0.6439,
"step": 8340
},
{
"epoch": 0.15717720272521957,
"grad_norm": 2.359375,
"learning_rate": 8.730026109660576e-06,
"loss": 0.6397,
"step": 8360
},
{
"epoch": 0.15755322474130862,
"grad_norm": 2.5625,
"learning_rate": 8.750913838120106e-06,
"loss": 0.6342,
"step": 8380
},
{
"epoch": 0.15792924675739764,
"grad_norm": 2.46875,
"learning_rate": 8.771801566579634e-06,
"loss": 0.6388,
"step": 8400
},
{
"epoch": 0.1583052687734867,
"grad_norm": 2.953125,
"learning_rate": 8.792689295039165e-06,
"loss": 0.6361,
"step": 8420
},
{
"epoch": 0.15868129078957574,
"grad_norm": 3.96875,
"learning_rate": 8.813577023498695e-06,
"loss": 0.6334,
"step": 8440
},
{
"epoch": 0.15905731280566476,
"grad_norm": 2.546875,
"learning_rate": 8.834464751958225e-06,
"loss": 0.6556,
"step": 8460
},
{
"epoch": 0.1594333348217538,
"grad_norm": 3.015625,
"learning_rate": 8.855352480417755e-06,
"loss": 0.6357,
"step": 8480
},
{
"epoch": 0.15980935683784286,
"grad_norm": 3.0,
"learning_rate": 8.876240208877286e-06,
"loss": 0.6383,
"step": 8500
},
{
"epoch": 0.1601853788539319,
"grad_norm": 2.625,
"learning_rate": 8.897127937336816e-06,
"loss": 0.6429,
"step": 8520
},
{
"epoch": 0.16056140087002094,
"grad_norm": 3.609375,
"learning_rate": 8.918015665796346e-06,
"loss": 0.6338,
"step": 8540
},
{
"epoch": 0.16093742288611,
"grad_norm": 2.171875,
"learning_rate": 8.938903394255876e-06,
"loss": 0.649,
"step": 8560
},
{
"epoch": 0.161313444902199,
"grad_norm": 3.984375,
"learning_rate": 8.959791122715405e-06,
"loss": 0.6272,
"step": 8580
},
{
"epoch": 0.16168946691828806,
"grad_norm": 3.015625,
"learning_rate": 8.980678851174935e-06,
"loss": 0.6372,
"step": 8600
},
{
"epoch": 0.1620654889343771,
"grad_norm": 2.3125,
"learning_rate": 9.001566579634465e-06,
"loss": 0.6178,
"step": 8620
},
{
"epoch": 0.16244151095046616,
"grad_norm": 4.875,
"learning_rate": 9.022454308093996e-06,
"loss": 0.6354,
"step": 8640
},
{
"epoch": 0.16281753296655518,
"grad_norm": 2.671875,
"learning_rate": 9.043342036553526e-06,
"loss": 0.6427,
"step": 8660
},
{
"epoch": 0.16319355498264423,
"grad_norm": 2.828125,
"learning_rate": 9.064229765013054e-06,
"loss": 0.6283,
"step": 8680
},
{
"epoch": 0.16356957699873328,
"grad_norm": 2.703125,
"learning_rate": 9.085117493472586e-06,
"loss": 0.6364,
"step": 8700
},
{
"epoch": 0.1639455990148223,
"grad_norm": 2.21875,
"learning_rate": 9.106005221932116e-06,
"loss": 0.6289,
"step": 8720
},
{
"epoch": 0.16432162103091136,
"grad_norm": 2.484375,
"learning_rate": 9.126892950391647e-06,
"loss": 0.6357,
"step": 8740
},
{
"epoch": 0.1646976430470004,
"grad_norm": 4.0625,
"learning_rate": 9.147780678851175e-06,
"loss": 0.6392,
"step": 8760
},
{
"epoch": 0.16507366506308943,
"grad_norm": 1.6953125,
"learning_rate": 9.168668407310705e-06,
"loss": 0.6211,
"step": 8780
},
{
"epoch": 0.16544968707917848,
"grad_norm": 2.34375,
"learning_rate": 9.189556135770236e-06,
"loss": 0.6395,
"step": 8800
},
{
"epoch": 0.16582570909526753,
"grad_norm": 2.765625,
"learning_rate": 9.210443864229766e-06,
"loss": 0.625,
"step": 8820
},
{
"epoch": 0.16620173111135658,
"grad_norm": 2.296875,
"learning_rate": 9.231331592689296e-06,
"loss": 0.6307,
"step": 8840
},
{
"epoch": 0.1665777531274456,
"grad_norm": 2.0625,
"learning_rate": 9.252219321148825e-06,
"loss": 0.6201,
"step": 8860
},
{
"epoch": 0.16695377514353466,
"grad_norm": 1.96875,
"learning_rate": 9.273107049608357e-06,
"loss": 0.6211,
"step": 8880
},
{
"epoch": 0.1673297971596237,
"grad_norm": 1.9765625,
"learning_rate": 9.293994778067887e-06,
"loss": 0.6242,
"step": 8900
},
{
"epoch": 0.16770581917571273,
"grad_norm": 2.0625,
"learning_rate": 9.314882506527415e-06,
"loss": 0.6216,
"step": 8920
},
{
"epoch": 0.16808184119180178,
"grad_norm": 2.21875,
"learning_rate": 9.335770234986946e-06,
"loss": 0.6271,
"step": 8940
},
{
"epoch": 0.16845786320789083,
"grad_norm": 1.7265625,
"learning_rate": 9.356657963446476e-06,
"loss": 0.6366,
"step": 8960
},
{
"epoch": 0.16883388522397985,
"grad_norm": 2.234375,
"learning_rate": 9.377545691906006e-06,
"loss": 0.6155,
"step": 8980
},
{
"epoch": 0.1692099072400689,
"grad_norm": 2.421875,
"learning_rate": 9.398433420365536e-06,
"loss": 0.6162,
"step": 9000
},
{
"epoch": 0.16958592925615795,
"grad_norm": 3.578125,
"learning_rate": 9.419321148825065e-06,
"loss": 0.6269,
"step": 9020
},
{
"epoch": 0.169961951272247,
"grad_norm": 1.953125,
"learning_rate": 9.440208877284595e-06,
"loss": 0.6306,
"step": 9040
},
{
"epoch": 0.17033797328833603,
"grad_norm": 1.7265625,
"learning_rate": 9.461096605744125e-06,
"loss": 0.6247,
"step": 9060
},
{
"epoch": 0.17071399530442508,
"grad_norm": 1.9140625,
"learning_rate": 9.481984334203657e-06,
"loss": 0.613,
"step": 9080
},
{
"epoch": 0.17109001732051413,
"grad_norm": 3.953125,
"learning_rate": 9.502872062663186e-06,
"loss": 0.6187,
"step": 9100
},
{
"epoch": 0.17146603933660315,
"grad_norm": 2.875,
"learning_rate": 9.523759791122716e-06,
"loss": 0.6215,
"step": 9120
},
{
"epoch": 0.1718420613526922,
"grad_norm": 2.484375,
"learning_rate": 9.544647519582246e-06,
"loss": 0.6234,
"step": 9140
},
{
"epoch": 0.17221808336878125,
"grad_norm": 2.421875,
"learning_rate": 9.565535248041777e-06,
"loss": 0.618,
"step": 9160
},
{
"epoch": 0.17259410538487027,
"grad_norm": 1.7734375,
"learning_rate": 9.586422976501307e-06,
"loss": 0.6134,
"step": 9180
},
{
"epoch": 0.17297012740095932,
"grad_norm": 2.296875,
"learning_rate": 9.607310704960835e-06,
"loss": 0.6127,
"step": 9200
},
{
"epoch": 0.17334614941704837,
"grad_norm": 2.625,
"learning_rate": 9.628198433420366e-06,
"loss": 0.6139,
"step": 9220
},
{
"epoch": 0.17372217143313742,
"grad_norm": 2.875,
"learning_rate": 9.649086161879896e-06,
"loss": 0.6121,
"step": 9240
},
{
"epoch": 0.17409819344922645,
"grad_norm": 2.09375,
"learning_rate": 9.669973890339426e-06,
"loss": 0.6126,
"step": 9260
},
{
"epoch": 0.1744742154653155,
"grad_norm": 1.2265625,
"learning_rate": 9.690861618798956e-06,
"loss": 0.6162,
"step": 9280
},
{
"epoch": 0.17485023748140455,
"grad_norm": 2.796875,
"learning_rate": 9.711749347258487e-06,
"loss": 0.6072,
"step": 9300
},
{
"epoch": 0.17522625949749357,
"grad_norm": 1.5234375,
"learning_rate": 9.732637075718017e-06,
"loss": 0.6122,
"step": 9320
},
{
"epoch": 0.17560228151358262,
"grad_norm": 1.515625,
"learning_rate": 9.753524804177547e-06,
"loss": 0.6053,
"step": 9340
},
{
"epoch": 0.17597830352967167,
"grad_norm": 1.4765625,
"learning_rate": 9.774412532637077e-06,
"loss": 0.6149,
"step": 9360
},
{
"epoch": 0.1763543255457607,
"grad_norm": 1.5,
"learning_rate": 9.795300261096606e-06,
"loss": 0.6229,
"step": 9380
},
{
"epoch": 0.17673034756184974,
"grad_norm": 1.609375,
"learning_rate": 9.816187989556136e-06,
"loss": 0.6134,
"step": 9400
},
{
"epoch": 0.1771063695779388,
"grad_norm": 2.140625,
"learning_rate": 9.837075718015666e-06,
"loss": 0.6155,
"step": 9420
},
{
"epoch": 0.17748239159402784,
"grad_norm": 1.6015625,
"learning_rate": 9.857963446475197e-06,
"loss": 0.6042,
"step": 9440
},
{
"epoch": 0.17785841361011687,
"grad_norm": 1.78125,
"learning_rate": 9.878851174934727e-06,
"loss": 0.6182,
"step": 9460
},
{
"epoch": 0.17823443562620592,
"grad_norm": 1.3515625,
"learning_rate": 9.899738903394257e-06,
"loss": 0.6036,
"step": 9480
},
{
"epoch": 0.17861045764229497,
"grad_norm": 1.984375,
"learning_rate": 9.920626631853787e-06,
"loss": 0.6027,
"step": 9500
},
{
"epoch": 0.178986479658384,
"grad_norm": 1.5,
"learning_rate": 9.941514360313318e-06,
"loss": 0.6089,
"step": 9520
},
{
"epoch": 0.17936250167447304,
"grad_norm": 1.7734375,
"learning_rate": 9.962402088772846e-06,
"loss": 0.604,
"step": 9540
},
{
"epoch": 0.1797385236905621,
"grad_norm": 1.28125,
"learning_rate": 9.983289817232376e-06,
"loss": 0.6004,
"step": 9560
},
{
"epoch": 0.1801145457066511,
"grad_norm": 1.5234375,
"learning_rate": 9.999999995880232e-06,
"loss": 0.6019,
"step": 9580
},
{
"epoch": 0.18049056772274016,
"grad_norm": 1.421875,
"learning_rate": 9.999999851688318e-06,
"loss": 0.6145,
"step": 9600
},
{
"epoch": 0.18086658973882921,
"grad_norm": 1.78125,
"learning_rate": 9.999999501507959e-06,
"loss": 0.6105,
"step": 9620
},
{
"epoch": 0.18124261175491824,
"grad_norm": 2.109375,
"learning_rate": 9.999998945339171e-06,
"loss": 0.6139,
"step": 9640
},
{
"epoch": 0.1816186337710073,
"grad_norm": 1.8671875,
"learning_rate": 9.999998183181976e-06,
"loss": 0.6122,
"step": 9660
},
{
"epoch": 0.18199465578709634,
"grad_norm": 1.6171875,
"learning_rate": 9.999997215036408e-06,
"loss": 0.6095,
"step": 9680
},
{
"epoch": 0.1823706778031854,
"grad_norm": 1.359375,
"learning_rate": 9.999996040902503e-06,
"loss": 0.5928,
"step": 9700
},
{
"epoch": 0.1827466998192744,
"grad_norm": 1.21875,
"learning_rate": 9.999994660780312e-06,
"loss": 0.6034,
"step": 9720
},
{
"epoch": 0.18312272183536346,
"grad_norm": 1.3984375,
"learning_rate": 9.99999307466989e-06,
"loss": 0.6012,
"step": 9740
},
{
"epoch": 0.1834987438514525,
"grad_norm": 2.09375,
"learning_rate": 9.999991282571304e-06,
"loss": 0.605,
"step": 9760
},
{
"epoch": 0.18387476586754153,
"grad_norm": 1.421875,
"learning_rate": 9.999989284484629e-06,
"loss": 0.6093,
"step": 9780
},
{
"epoch": 0.18425078788363058,
"grad_norm": 1.4765625,
"learning_rate": 9.999987080409942e-06,
"loss": 0.6032,
"step": 9800
},
{
"epoch": 0.18462680989971963,
"grad_norm": 1.2421875,
"learning_rate": 9.99998467034734e-06,
"loss": 0.6019,
"step": 9820
},
{
"epoch": 0.18500283191580866,
"grad_norm": 1.84375,
"learning_rate": 9.99998205429692e-06,
"loss": 0.6006,
"step": 9840
},
{
"epoch": 0.1853788539318977,
"grad_norm": 1.328125,
"learning_rate": 9.999979232258787e-06,
"loss": 0.586,
"step": 9860
},
{
"epoch": 0.18575487594798676,
"grad_norm": 1.375,
"learning_rate": 9.999976204233062e-06,
"loss": 0.614,
"step": 9880
},
{
"epoch": 0.1861308979640758,
"grad_norm": 1.2421875,
"learning_rate": 9.999972970219865e-06,
"loss": 0.6049,
"step": 9900
},
{
"epoch": 0.18650691998016483,
"grad_norm": 1.3984375,
"learning_rate": 9.999969530219333e-06,
"loss": 0.6048,
"step": 9920
},
{
"epoch": 0.18688294199625388,
"grad_norm": 1.3828125,
"learning_rate": 9.999965884231607e-06,
"loss": 0.608,
"step": 9940
},
{
"epoch": 0.18725896401234293,
"grad_norm": 1.609375,
"learning_rate": 9.999962032256836e-06,
"loss": 0.6017,
"step": 9960
},
{
"epoch": 0.18763498602843195,
"grad_norm": 1.265625,
"learning_rate": 9.99995797429518e-06,
"loss": 0.592,
"step": 9980
},
{
"epoch": 0.188011008044521,
"grad_norm": 1.421875,
"learning_rate": 9.999953710346804e-06,
"loss": 0.602,
"step": 10000
},
{
"epoch": 0.18838703006061006,
"grad_norm": 1.3125,
"learning_rate": 9.999949240411886e-06,
"loss": 0.5894,
"step": 10020
},
{
"epoch": 0.18876305207669908,
"grad_norm": 1.375,
"learning_rate": 9.99994456449061e-06,
"loss": 0.5908,
"step": 10040
},
{
"epoch": 0.18913907409278813,
"grad_norm": 1.2890625,
"learning_rate": 9.999939682583166e-06,
"loss": 0.5914,
"step": 10060
},
{
"epoch": 0.18951509610887718,
"grad_norm": 1.6328125,
"learning_rate": 9.999934594689759e-06,
"loss": 0.5951,
"step": 10080
},
{
"epoch": 0.18989111812496623,
"grad_norm": 1.1796875,
"learning_rate": 9.999929300810595e-06,
"loss": 0.5925,
"step": 10100
},
{
"epoch": 0.19026714014105525,
"grad_norm": 1.1953125,
"learning_rate": 9.999923800945895e-06,
"loss": 0.5982,
"step": 10120
},
{
"epoch": 0.1906431621571443,
"grad_norm": 1.1640625,
"learning_rate": 9.999918095095884e-06,
"loss": 0.6023,
"step": 10140
},
{
"epoch": 0.19101918417323335,
"grad_norm": 1.171875,
"learning_rate": 9.999912183260798e-06,
"loss": 0.5926,
"step": 10160
},
{
"epoch": 0.19139520618932238,
"grad_norm": 1.28125,
"learning_rate": 9.999906065440878e-06,
"loss": 0.5869,
"step": 10180
},
{
"epoch": 0.19177122820541143,
"grad_norm": 1.296875,
"learning_rate": 9.999899741636381e-06,
"loss": 0.5965,
"step": 10200
},
{
"epoch": 0.19214725022150048,
"grad_norm": 1.0234375,
"learning_rate": 9.999893211847563e-06,
"loss": 0.601,
"step": 10220
},
{
"epoch": 0.1925232722375895,
"grad_norm": 1.0390625,
"learning_rate": 9.999886476074694e-06,
"loss": 0.5916,
"step": 10240
},
{
"epoch": 0.19289929425367855,
"grad_norm": 1.078125,
"learning_rate": 9.999879534318051e-06,
"loss": 0.5947,
"step": 10260
},
{
"epoch": 0.1932753162697676,
"grad_norm": 1.109375,
"learning_rate": 9.999872386577923e-06,
"loss": 0.5979,
"step": 10280
},
{
"epoch": 0.19365133828585665,
"grad_norm": 1.21875,
"learning_rate": 9.9998650328546e-06,
"loss": 0.5927,
"step": 10300
},
{
"epoch": 0.19402736030194567,
"grad_norm": 3.171875,
"learning_rate": 9.99985747314839e-06,
"loss": 0.5999,
"step": 10320
},
{
"epoch": 0.19440338231803472,
"grad_norm": 1.1640625,
"learning_rate": 9.999849707459601e-06,
"loss": 0.6072,
"step": 10340
},
{
"epoch": 0.19477940433412377,
"grad_norm": 1.6171875,
"learning_rate": 9.999841735788555e-06,
"loss": 0.601,
"step": 10360
},
{
"epoch": 0.1951554263502128,
"grad_norm": 1.484375,
"learning_rate": 9.999833558135578e-06,
"loss": 0.5996,
"step": 10380
},
{
"epoch": 0.19553144836630185,
"grad_norm": 1.015625,
"learning_rate": 9.999825174501009e-06,
"loss": 0.5907,
"step": 10400
},
{
"epoch": 0.1959074703823909,
"grad_norm": 1.265625,
"learning_rate": 9.999816584885192e-06,
"loss": 0.5888,
"step": 10420
},
{
"epoch": 0.19628349239847992,
"grad_norm": 1.515625,
"learning_rate": 9.99980778928848e-06,
"loss": 0.5894,
"step": 10440
},
{
"epoch": 0.19665951441456897,
"grad_norm": 2.25,
"learning_rate": 9.999798787711239e-06,
"loss": 0.5938,
"step": 10460
},
{
"epoch": 0.19703553643065802,
"grad_norm": 1.1796875,
"learning_rate": 9.999789580153835e-06,
"loss": 0.5832,
"step": 10480
},
{
"epoch": 0.19741155844674707,
"grad_norm": 1.1015625,
"learning_rate": 9.999780166616652e-06,
"loss": 0.579,
"step": 10500
},
{
"epoch": 0.1977875804628361,
"grad_norm": 1.015625,
"learning_rate": 9.999770547100073e-06,
"loss": 0.596,
"step": 10520
},
{
"epoch": 0.19816360247892514,
"grad_norm": 1.453125,
"learning_rate": 9.9997607216045e-06,
"loss": 0.5934,
"step": 10540
},
{
"epoch": 0.1985396244950142,
"grad_norm": 1.4140625,
"learning_rate": 9.999750690130335e-06,
"loss": 0.5884,
"step": 10560
},
{
"epoch": 0.19891564651110322,
"grad_norm": 1.390625,
"learning_rate": 9.99974045267799e-06,
"loss": 0.5949,
"step": 10580
},
{
"epoch": 0.19929166852719227,
"grad_norm": 1.2265625,
"learning_rate": 9.999730009247888e-06,
"loss": 0.5935,
"step": 10600
},
{
"epoch": 0.19966769054328132,
"grad_norm": 1.265625,
"learning_rate": 9.999719359840459e-06,
"loss": 0.5904,
"step": 10620
},
{
"epoch": 0.20004371255937034,
"grad_norm": 1.296875,
"learning_rate": 9.99970850445614e-06,
"loss": 0.5811,
"step": 10640
},
{
"epoch": 0.2004197345754594,
"grad_norm": 0.98828125,
"learning_rate": 9.999697443095383e-06,
"loss": 0.584,
"step": 10660
},
{
"epoch": 0.20079575659154844,
"grad_norm": 1.125,
"learning_rate": 9.999686175758639e-06,
"loss": 0.586,
"step": 10680
},
{
"epoch": 0.20117177860763746,
"grad_norm": 1.0234375,
"learning_rate": 9.999674702446375e-06,
"loss": 0.5924,
"step": 10700
},
{
"epoch": 0.20154780062372651,
"grad_norm": 1.1796875,
"learning_rate": 9.999663023159062e-06,
"loss": 0.5876,
"step": 10720
},
{
"epoch": 0.20192382263981556,
"grad_norm": 0.97265625,
"learning_rate": 9.999651137897182e-06,
"loss": 0.5857,
"step": 10740
},
{
"epoch": 0.20229984465590461,
"grad_norm": 1.0234375,
"learning_rate": 9.999639046661226e-06,
"loss": 0.5847,
"step": 10760
},
{
"epoch": 0.20267586667199364,
"grad_norm": 1.1953125,
"learning_rate": 9.999626749451688e-06,
"loss": 0.5865,
"step": 10780
},
{
"epoch": 0.2030518886880827,
"grad_norm": 1.0234375,
"learning_rate": 9.999614246269076e-06,
"loss": 0.5876,
"step": 10800
},
{
"epoch": 0.20342791070417174,
"grad_norm": 1.4296875,
"learning_rate": 9.999601537113908e-06,
"loss": 0.5751,
"step": 10820
},
{
"epoch": 0.20380393272026076,
"grad_norm": 1.125,
"learning_rate": 9.999588621986707e-06,
"loss": 0.5764,
"step": 10840
},
{
"epoch": 0.2041799547363498,
"grad_norm": 1.671875,
"learning_rate": 9.999575500888004e-06,
"loss": 0.5752,
"step": 10860
},
{
"epoch": 0.20455597675243886,
"grad_norm": 0.921875,
"learning_rate": 9.999562173818338e-06,
"loss": 0.5858,
"step": 10880
},
{
"epoch": 0.20493199876852788,
"grad_norm": 1.09375,
"learning_rate": 9.999548640778259e-06,
"loss": 0.5932,
"step": 10900
},
{
"epoch": 0.20530802078461693,
"grad_norm": 0.97265625,
"learning_rate": 9.999534901768326e-06,
"loss": 0.5797,
"step": 10920
},
{
"epoch": 0.20568404280070599,
"grad_norm": 1.0234375,
"learning_rate": 9.999520956789104e-06,
"loss": 0.5839,
"step": 10940
},
{
"epoch": 0.20606006481679504,
"grad_norm": 0.96484375,
"learning_rate": 9.999506805841169e-06,
"loss": 0.5883,
"step": 10960
},
{
"epoch": 0.20643608683288406,
"grad_norm": 1.125,
"learning_rate": 9.999492448925102e-06,
"loss": 0.5793,
"step": 10980
},
{
"epoch": 0.2068121088489731,
"grad_norm": 1.078125,
"learning_rate": 9.999477886041493e-06,
"loss": 0.5795,
"step": 11000
},
{
"epoch": 0.20718813086506216,
"grad_norm": 1.0859375,
"learning_rate": 9.999463117190945e-06,
"loss": 0.5798,
"step": 11020
},
{
"epoch": 0.20756415288115118,
"grad_norm": 1.15625,
"learning_rate": 9.999448142374066e-06,
"loss": 0.5855,
"step": 11040
},
{
"epoch": 0.20794017489724023,
"grad_norm": 0.85546875,
"learning_rate": 9.999432961591472e-06,
"loss": 0.6086,
"step": 11060
},
{
"epoch": 0.20831619691332928,
"grad_norm": 1.2265625,
"learning_rate": 9.999417574843788e-06,
"loss": 0.5777,
"step": 11080
},
{
"epoch": 0.2086922189294183,
"grad_norm": 1.0,
"learning_rate": 9.99940198213165e-06,
"loss": 0.5858,
"step": 11100
},
{
"epoch": 0.20906824094550736,
"grad_norm": 0.86328125,
"learning_rate": 9.9993861834557e-06,
"loss": 0.5761,
"step": 11120
},
{
"epoch": 0.2094442629615964,
"grad_norm": 1.4140625,
"learning_rate": 9.999370178816586e-06,
"loss": 0.5777,
"step": 11140
},
{
"epoch": 0.20982028497768546,
"grad_norm": 1.453125,
"learning_rate": 9.999353968214969e-06,
"loss": 0.5853,
"step": 11160
},
{
"epoch": 0.21019630699377448,
"grad_norm": 1.359375,
"learning_rate": 9.999337551651517e-06,
"loss": 0.5951,
"step": 11180
},
{
"epoch": 0.21057232900986353,
"grad_norm": 0.8828125,
"learning_rate": 9.999320929126909e-06,
"loss": 0.5874,
"step": 11200
},
{
"epoch": 0.21094835102595258,
"grad_norm": 0.859375,
"learning_rate": 9.999304100641824e-06,
"loss": 0.5924,
"step": 11220
},
{
"epoch": 0.2113243730420416,
"grad_norm": 1.015625,
"learning_rate": 9.99928706619696e-06,
"loss": 0.5927,
"step": 11240
},
{
"epoch": 0.21170039505813065,
"grad_norm": 0.91015625,
"learning_rate": 9.999269825793018e-06,
"loss": 0.5941,
"step": 11260
},
{
"epoch": 0.2120764170742197,
"grad_norm": 0.9296875,
"learning_rate": 9.999252379430707e-06,
"loss": 0.5873,
"step": 11280
},
{
"epoch": 0.21245243909030873,
"grad_norm": 0.8671875,
"learning_rate": 9.999234727110746e-06,
"loss": 0.586,
"step": 11300
},
{
"epoch": 0.21282846110639778,
"grad_norm": 0.8515625,
"learning_rate": 9.999216868833864e-06,
"loss": 0.5901,
"step": 11320
},
{
"epoch": 0.21320448312248683,
"grad_norm": 1.03125,
"learning_rate": 9.999198804600793e-06,
"loss": 0.5738,
"step": 11340
},
{
"epoch": 0.21358050513857588,
"grad_norm": 1.1328125,
"learning_rate": 9.999180534412281e-06,
"loss": 0.5837,
"step": 11360
},
{
"epoch": 0.2139565271546649,
"grad_norm": 0.98046875,
"learning_rate": 9.999162058269079e-06,
"loss": 0.58,
"step": 11380
},
{
"epoch": 0.21433254917075395,
"grad_norm": 0.8359375,
"learning_rate": 9.99914337617195e-06,
"loss": 0.5803,
"step": 11400
},
{
"epoch": 0.214708571186843,
"grad_norm": 0.94140625,
"learning_rate": 9.999124488121658e-06,
"loss": 0.5759,
"step": 11420
},
{
"epoch": 0.21508459320293202,
"grad_norm": 0.9921875,
"learning_rate": 9.999105394118988e-06,
"loss": 0.5867,
"step": 11440
},
{
"epoch": 0.21546061521902107,
"grad_norm": 0.8203125,
"learning_rate": 9.999086094164724e-06,
"loss": 0.5784,
"step": 11460
},
{
"epoch": 0.21583663723511012,
"grad_norm": 0.90625,
"learning_rate": 9.99906658825966e-06,
"loss": 0.5796,
"step": 11480
},
{
"epoch": 0.21621265925119915,
"grad_norm": 0.890625,
"learning_rate": 9.999046876404602e-06,
"loss": 0.5758,
"step": 11500
},
{
"epoch": 0.2165886812672882,
"grad_norm": 0.921875,
"learning_rate": 9.999026958600358e-06,
"loss": 0.5852,
"step": 11520
},
{
"epoch": 0.21696470328337725,
"grad_norm": 1.109375,
"learning_rate": 9.999006834847752e-06,
"loss": 0.576,
"step": 11540
},
{
"epoch": 0.2173407252994663,
"grad_norm": 0.83203125,
"learning_rate": 9.998986505147612e-06,
"loss": 0.5848,
"step": 11560
},
{
"epoch": 0.21771674731555532,
"grad_norm": 1.015625,
"learning_rate": 9.998965969500779e-06,
"loss": 0.5871,
"step": 11580
},
{
"epoch": 0.21809276933164437,
"grad_norm": 0.86328125,
"learning_rate": 9.99894522790809e-06,
"loss": 0.5829,
"step": 11600
},
{
"epoch": 0.21846879134773342,
"grad_norm": 0.94921875,
"learning_rate": 9.99892428037041e-06,
"loss": 0.5742,
"step": 11620
},
{
"epoch": 0.21884481336382244,
"grad_norm": 0.89453125,
"learning_rate": 9.998903126888595e-06,
"loss": 0.5841,
"step": 11640
},
{
"epoch": 0.2192208353799115,
"grad_norm": 1.0625,
"learning_rate": 9.998881767463519e-06,
"loss": 0.5819,
"step": 11660
},
{
"epoch": 0.21959685739600054,
"grad_norm": 1.0078125,
"learning_rate": 9.998860202096063e-06,
"loss": 0.5805,
"step": 11680
},
{
"epoch": 0.21997287941208957,
"grad_norm": 0.79296875,
"learning_rate": 9.998838430787112e-06,
"loss": 0.5785,
"step": 11700
},
{
"epoch": 0.22034890142817862,
"grad_norm": 1.0078125,
"learning_rate": 9.998816453537568e-06,
"loss": 0.5804,
"step": 11720
},
{
"epoch": 0.22072492344426767,
"grad_norm": 0.91796875,
"learning_rate": 9.998794270348331e-06,
"loss": 0.5854,
"step": 11740
},
{
"epoch": 0.2211009454603567,
"grad_norm": 0.890625,
"learning_rate": 9.998771881220319e-06,
"loss": 0.5857,
"step": 11760
},
{
"epoch": 0.22147696747644574,
"grad_norm": 0.8125,
"learning_rate": 9.99874928615445e-06,
"loss": 0.5855,
"step": 11780
},
{
"epoch": 0.2218529894925348,
"grad_norm": 0.9375,
"learning_rate": 9.99872648515166e-06,
"loss": 0.5736,
"step": 11800
},
{
"epoch": 0.22222901150862384,
"grad_norm": 0.86328125,
"learning_rate": 9.998703478212885e-06,
"loss": 0.5792,
"step": 11820
},
{
"epoch": 0.22260503352471286,
"grad_norm": 0.85546875,
"learning_rate": 9.998680265339076e-06,
"loss": 0.5709,
"step": 11840
},
{
"epoch": 0.22298105554080191,
"grad_norm": 0.93359375,
"learning_rate": 9.998656846531185e-06,
"loss": 0.5717,
"step": 11860
},
{
"epoch": 0.22335707755689096,
"grad_norm": 1.0,
"learning_rate": 9.99863322179018e-06,
"loss": 0.5719,
"step": 11880
},
{
"epoch": 0.22373309957298,
"grad_norm": 0.86328125,
"learning_rate": 9.99860939111703e-06,
"loss": 0.5874,
"step": 11900
},
{
"epoch": 0.22410912158906904,
"grad_norm": 0.90234375,
"learning_rate": 9.998585354512725e-06,
"loss": 0.5723,
"step": 11920
},
{
"epoch": 0.2244851436051581,
"grad_norm": 0.96875,
"learning_rate": 9.998561111978246e-06,
"loss": 0.5899,
"step": 11940
},
{
"epoch": 0.2248611656212471,
"grad_norm": 0.76953125,
"learning_rate": 9.998536663514599e-06,
"loss": 0.5824,
"step": 11960
},
{
"epoch": 0.22523718763733616,
"grad_norm": 0.98046875,
"learning_rate": 9.998512009122787e-06,
"loss": 0.5668,
"step": 11980
},
{
"epoch": 0.2256132096534252,
"grad_norm": 1.0234375,
"learning_rate": 9.998487148803826e-06,
"loss": 0.5701,
"step": 12000
},
{
"epoch": 0.22598923166951426,
"grad_norm": 1.0078125,
"learning_rate": 9.998462082558741e-06,
"loss": 0.576,
"step": 12020
},
{
"epoch": 0.22636525368560328,
"grad_norm": 0.90625,
"learning_rate": 9.998436810388566e-06,
"loss": 0.5761,
"step": 12040
},
{
"epoch": 0.22674127570169234,
"grad_norm": 1.34375,
"learning_rate": 9.998411332294341e-06,
"loss": 0.5786,
"step": 12060
},
{
"epoch": 0.22711729771778139,
"grad_norm": 0.8125,
"learning_rate": 9.998385648277116e-06,
"loss": 0.5758,
"step": 12080
},
{
"epoch": 0.2274933197338704,
"grad_norm": 0.9921875,
"learning_rate": 9.998359758337947e-06,
"loss": 0.5769,
"step": 12100
},
{
"epoch": 0.22786934174995946,
"grad_norm": 0.76953125,
"learning_rate": 9.998333662477903e-06,
"loss": 0.5666,
"step": 12120
},
{
"epoch": 0.2282453637660485,
"grad_norm": 0.79296875,
"learning_rate": 9.998307360698059e-06,
"loss": 0.5754,
"step": 12140
},
{
"epoch": 0.22862138578213753,
"grad_norm": 0.8359375,
"learning_rate": 9.998280852999496e-06,
"loss": 0.5627,
"step": 12160
},
{
"epoch": 0.22899740779822658,
"grad_norm": 0.8359375,
"learning_rate": 9.99825413938331e-06,
"loss": 0.5797,
"step": 12180
},
{
"epoch": 0.22937342981431563,
"grad_norm": 0.8515625,
"learning_rate": 9.998227219850597e-06,
"loss": 0.5875,
"step": 12200
},
{
"epoch": 0.22974945183040468,
"grad_norm": 0.8359375,
"learning_rate": 9.998200094402471e-06,
"loss": 0.5809,
"step": 12220
},
{
"epoch": 0.2301254738464937,
"grad_norm": 1.046875,
"learning_rate": 9.998172763040048e-06,
"loss": 0.5714,
"step": 12240
},
{
"epoch": 0.23050149586258276,
"grad_norm": 0.94140625,
"learning_rate": 9.99814522576445e-06,
"loss": 0.5755,
"step": 12260
},
{
"epoch": 0.2308775178786718,
"grad_norm": 0.9453125,
"learning_rate": 9.998117482576816e-06,
"loss": 0.5764,
"step": 12280
},
{
"epoch": 0.23125353989476083,
"grad_norm": 0.9375,
"learning_rate": 9.998089533478287e-06,
"loss": 0.5699,
"step": 12300
},
{
"epoch": 0.23162956191084988,
"grad_norm": 0.83203125,
"learning_rate": 9.998061378470016e-06,
"loss": 0.5814,
"step": 12320
},
{
"epoch": 0.23200558392693893,
"grad_norm": 0.78125,
"learning_rate": 9.998033017553162e-06,
"loss": 0.5776,
"step": 12340
},
{
"epoch": 0.23238160594302795,
"grad_norm": 0.88671875,
"learning_rate": 9.99800445072889e-06,
"loss": 0.5776,
"step": 12360
},
{
"epoch": 0.232757627959117,
"grad_norm": 0.87890625,
"learning_rate": 9.997975677998385e-06,
"loss": 0.574,
"step": 12380
},
{
"epoch": 0.23313364997520605,
"grad_norm": 1.328125,
"learning_rate": 9.997946699362825e-06,
"loss": 0.5668,
"step": 12400
},
{
"epoch": 0.2335096719912951,
"grad_norm": 0.80859375,
"learning_rate": 9.997917514823406e-06,
"loss": 0.5711,
"step": 12420
},
{
"epoch": 0.23388569400738413,
"grad_norm": 0.76171875,
"learning_rate": 9.99788812438133e-06,
"loss": 0.5556,
"step": 12440
},
{
"epoch": 0.23426171602347318,
"grad_norm": 0.96484375,
"learning_rate": 9.99785852803781e-06,
"loss": 0.5841,
"step": 12460
},
{
"epoch": 0.23463773803956223,
"grad_norm": 0.91015625,
"learning_rate": 9.997828725794061e-06,
"loss": 0.5763,
"step": 12480
},
{
"epoch": 0.23501376005565125,
"grad_norm": 0.79296875,
"learning_rate": 9.997798717651316e-06,
"loss": 0.5698,
"step": 12500
},
{
"epoch": 0.2353897820717403,
"grad_norm": 0.9140625,
"learning_rate": 9.99776850361081e-06,
"loss": 0.5708,
"step": 12520
},
{
"epoch": 0.23576580408782935,
"grad_norm": 0.75,
"learning_rate": 9.997738083673785e-06,
"loss": 0.5727,
"step": 12540
},
{
"epoch": 0.23614182610391837,
"grad_norm": 0.95703125,
"learning_rate": 9.997707457841496e-06,
"loss": 0.5596,
"step": 12560
},
{
"epoch": 0.23651784812000742,
"grad_norm": 1.1015625,
"learning_rate": 9.997676626115205e-06,
"loss": 0.5688,
"step": 12580
},
{
"epoch": 0.23689387013609647,
"grad_norm": 0.94140625,
"learning_rate": 9.997645588496181e-06,
"loss": 0.5598,
"step": 12600
},
{
"epoch": 0.23726989215218552,
"grad_norm": 0.84375,
"learning_rate": 9.997614344985705e-06,
"loss": 0.5573,
"step": 12620
},
{
"epoch": 0.23764591416827455,
"grad_norm": 0.8203125,
"learning_rate": 9.99758289558506e-06,
"loss": 0.5708,
"step": 12640
},
{
"epoch": 0.2380219361843636,
"grad_norm": 0.8984375,
"learning_rate": 9.997551240295546e-06,
"loss": 0.5752,
"step": 12660
},
{
"epoch": 0.23839795820045265,
"grad_norm": 0.73046875,
"learning_rate": 9.997519379118465e-06,
"loss": 0.5741,
"step": 12680
},
{
"epoch": 0.23877398021654167,
"grad_norm": 0.75390625,
"learning_rate": 9.99748731205513e-06,
"loss": 0.5625,
"step": 12700
},
{
"epoch": 0.23915000223263072,
"grad_norm": 1.09375,
"learning_rate": 9.997455039106861e-06,
"loss": 0.5751,
"step": 12720
},
{
"epoch": 0.23952602424871977,
"grad_norm": 0.84765625,
"learning_rate": 9.99742256027499e-06,
"loss": 0.5627,
"step": 12740
},
{
"epoch": 0.2399020462648088,
"grad_norm": 1.15625,
"learning_rate": 9.997389875560853e-06,
"loss": 0.5675,
"step": 12760
},
{
"epoch": 0.24027806828089784,
"grad_norm": 0.90234375,
"learning_rate": 9.997356984965798e-06,
"loss": 0.5751,
"step": 12780
},
{
"epoch": 0.2406540902969869,
"grad_norm": 0.83203125,
"learning_rate": 9.997323888491178e-06,
"loss": 0.5762,
"step": 12800
},
{
"epoch": 0.24103011231307592,
"grad_norm": 0.80859375,
"learning_rate": 9.997290586138357e-06,
"loss": 0.5744,
"step": 12820
},
{
"epoch": 0.24140613432916497,
"grad_norm": 0.703125,
"learning_rate": 9.99725707790871e-06,
"loss": 0.5676,
"step": 12840
},
{
"epoch": 0.24178215634525402,
"grad_norm": 0.76953125,
"learning_rate": 9.997223363803615e-06,
"loss": 0.5817,
"step": 12860
},
{
"epoch": 0.24215817836134307,
"grad_norm": 0.80859375,
"learning_rate": 9.99718944382446e-06,
"loss": 0.5763,
"step": 12880
},
{
"epoch": 0.2425342003774321,
"grad_norm": 0.8125,
"learning_rate": 9.997155317972643e-06,
"loss": 0.5745,
"step": 12900
},
{
"epoch": 0.24291022239352114,
"grad_norm": 0.7578125,
"learning_rate": 9.99712098624957e-06,
"loss": 0.5663,
"step": 12920
},
{
"epoch": 0.2432862444096102,
"grad_norm": 0.86328125,
"learning_rate": 9.997086448656658e-06,
"loss": 0.5695,
"step": 12940
},
{
"epoch": 0.24366226642569921,
"grad_norm": 0.71875,
"learning_rate": 9.997051705195326e-06,
"loss": 0.573,
"step": 12960
},
{
"epoch": 0.24403828844178826,
"grad_norm": 0.88671875,
"learning_rate": 9.997016755867008e-06,
"loss": 0.5698,
"step": 12980
},
{
"epoch": 0.24441431045787732,
"grad_norm": 0.921875,
"learning_rate": 9.996981600673144e-06,
"loss": 0.5666,
"step": 13000
},
{
"epoch": 0.24479033247396634,
"grad_norm": 0.75390625,
"learning_rate": 9.99694623961518e-06,
"loss": 0.5694,
"step": 13020
},
{
"epoch": 0.2451663544900554,
"grad_norm": 0.9140625,
"learning_rate": 9.996910672694573e-06,
"loss": 0.5574,
"step": 13040
},
{
"epoch": 0.24554237650614444,
"grad_norm": 0.94140625,
"learning_rate": 9.99687489991279e-06,
"loss": 0.5564,
"step": 13060
},
{
"epoch": 0.2459183985222335,
"grad_norm": 0.9140625,
"learning_rate": 9.996838921271304e-06,
"loss": 0.5666,
"step": 13080
},
{
"epoch": 0.2462944205383225,
"grad_norm": 0.890625,
"learning_rate": 9.996802736771597e-06,
"loss": 0.5758,
"step": 13100
},
{
"epoch": 0.24667044255441156,
"grad_norm": 0.69140625,
"learning_rate": 9.99676634641516e-06,
"loss": 0.5619,
"step": 13120
},
{
"epoch": 0.2470464645705006,
"grad_norm": 0.8203125,
"learning_rate": 9.996729750203493e-06,
"loss": 0.5817,
"step": 13140
},
{
"epoch": 0.24742248658658964,
"grad_norm": 0.84375,
"learning_rate": 9.996692948138102e-06,
"loss": 0.5705,
"step": 13160
},
{
"epoch": 0.24779850860267869,
"grad_norm": 0.8203125,
"learning_rate": 9.996655940220504e-06,
"loss": 0.5713,
"step": 13180
},
{
"epoch": 0.24817453061876774,
"grad_norm": 0.89453125,
"learning_rate": 9.996618726452223e-06,
"loss": 0.5715,
"step": 13200
},
{
"epoch": 0.24855055263485676,
"grad_norm": 0.8046875,
"learning_rate": 9.996581306834793e-06,
"loss": 0.5622,
"step": 13220
},
{
"epoch": 0.2489265746509458,
"grad_norm": 0.71875,
"learning_rate": 9.996543681369756e-06,
"loss": 0.5636,
"step": 13240
},
{
"epoch": 0.24930259666703486,
"grad_norm": 1.0703125,
"learning_rate": 9.996505850058663e-06,
"loss": 0.5753,
"step": 13260
},
{
"epoch": 0.2496786186831239,
"grad_norm": 0.796875,
"learning_rate": 9.996467812903067e-06,
"loss": 0.5774,
"step": 13280
}
],
"logging_steps": 20,
"max_steps": 319134,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 13297,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9972288387703433e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}