{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.24999823739679958,
  "eval_steps": 500,
  "global_step": 13297,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000376022016089042,
      "grad_norm": 69.5,
      "learning_rate": 1.9843342036553526e-08,
      "loss": 2.5216,
      "step": 20
    },
    {
      "epoch": 0.000752044032178084,
      "grad_norm": 75.0,
      "learning_rate": 4.073107049608355e-08,
      "loss": 2.4632,
      "step": 40
    },
    {
      "epoch": 0.001128066048267126,
      "grad_norm": 109.0,
      "learning_rate": 6.161879895561358e-08,
      "loss": 2.5604,
      "step": 60
    },
    {
      "epoch": 0.001504088064356168,
      "grad_norm": 48.5,
      "learning_rate": 8.250652741514362e-08,
      "loss": 2.4744,
      "step": 80
    },
    {
      "epoch": 0.0018801100804452101,
      "grad_norm": 45.25,
      "learning_rate": 1.0339425587467364e-07,
      "loss": 2.5512,
      "step": 100
    },
    {
      "epoch": 0.002256132096534252,
      "grad_norm": 38.25,
      "learning_rate": 1.2428198433420367e-07,
      "loss": 2.4959,
      "step": 120
    },
    {
      "epoch": 0.002632154112623294,
      "grad_norm": 30.5,
      "learning_rate": 1.451697127937337e-07,
      "loss": 2.5159,
      "step": 140
    },
    {
      "epoch": 0.003008176128712336,
      "grad_norm": 29.0,
      "learning_rate": 1.660574412532637e-07,
      "loss": 2.5399,
      "step": 160
    },
    {
      "epoch": 0.003384198144801378,
      "grad_norm": 29.625,
      "learning_rate": 1.8694516971279375e-07,
      "loss": 2.4812,
      "step": 180
    },
    {
      "epoch": 0.0037602201608904202,
      "grad_norm": 25.0,
      "learning_rate": 2.0783289817232378e-07,
      "loss": 2.4797,
      "step": 200
    },
    {
      "epoch": 0.004136242176979462,
      "grad_norm": 24.625,
      "learning_rate": 2.2872062663185383e-07,
      "loss": 2.4898,
      "step": 220
    },
    {
      "epoch": 0.004512264193068504,
      "grad_norm": 17.0,
      "learning_rate": 2.4960835509138383e-07,
      "loss": 2.4359,
      "step": 240
    },
    {
      "epoch": 0.004888286209157546,
      "grad_norm": 39.75,
      "learning_rate": 2.7049608355091385e-07,
      "loss": 2.4451,
      "step": 260
    },
    {
      "epoch": 0.005264308225246588,
      "grad_norm": 17.75,
      "learning_rate": 2.913838120104439e-07,
      "loss": 2.4747,
      "step": 280
    },
    {
      "epoch": 0.00564033024133563,
      "grad_norm": 19.75,
      "learning_rate": 3.122715404699739e-07,
      "loss": 2.4672,
      "step": 300
    },
    {
      "epoch": 0.006016352257424672,
      "grad_norm": 27.125,
      "learning_rate": 3.3315926892950393e-07,
      "loss": 2.44,
      "step": 320
    },
    {
      "epoch": 0.006392374273513714,
      "grad_norm": 19.75,
      "learning_rate": 3.5404699738903396e-07,
      "loss": 2.494,
      "step": 340
    },
    {
      "epoch": 0.006768396289602756,
      "grad_norm": 39.25,
      "learning_rate": 3.7493472584856404e-07,
      "loss": 2.4068,
      "step": 360
    },
    {
      "epoch": 0.007144418305691798,
      "grad_norm": 27.75,
      "learning_rate": 3.95822454308094e-07,
      "loss": 2.3509,
      "step": 380
    },
    {
      "epoch": 0.0075204403217808405,
      "grad_norm": 14.9375,
      "learning_rate": 4.1671018276762403e-07,
      "loss": 2.3596,
      "step": 400
    },
    {
      "epoch": 0.007896462337869883,
      "grad_norm": 21.5,
      "learning_rate": 4.375979112271541e-07,
      "loss": 2.4322,
      "step": 420
    },
    {
      "epoch": 0.008272484353958925,
      "grad_norm": 17.25,
      "learning_rate": 4.584856396866841e-07,
      "loss": 2.4769,
      "step": 440
    },
    {
      "epoch": 0.008648506370047966,
      "grad_norm": 24.375,
      "learning_rate": 4.793733681462142e-07,
      "loss": 2.3957,
      "step": 460
    },
    {
      "epoch": 0.009024528386137008,
      "grad_norm": 16.875,
      "learning_rate": 5.002610966057442e-07,
      "loss": 2.4445,
      "step": 480
    },
    {
      "epoch": 0.00940055040222605,
      "grad_norm": 21.75,
      "learning_rate": 5.211488250652742e-07,
      "loss": 2.4009,
      "step": 500
    },
    {
      "epoch": 0.009776572418315092,
      "grad_norm": 26.375,
      "learning_rate": 5.420365535248042e-07,
      "loss": 2.3618,
      "step": 520
    },
    {
      "epoch": 0.010152594434404135,
      "grad_norm": 24.5,
      "learning_rate": 5.629242819843343e-07,
      "loss": 2.3718,
      "step": 540
    },
    {
      "epoch": 0.010528616450493177,
      "grad_norm": 19.875,
      "learning_rate": 5.838120104438643e-07,
      "loss": 2.3708,
      "step": 560
    },
    {
      "epoch": 0.010904638466582219,
      "grad_norm": 18.875,
      "learning_rate": 6.046997389033943e-07,
      "loss": 2.4253,
      "step": 580
    },
    {
      "epoch": 0.01128066048267126,
      "grad_norm": 28.0,
      "learning_rate": 6.255874673629243e-07,
      "loss": 2.3592,
      "step": 600
    },
    {
      "epoch": 0.011656682498760302,
      "grad_norm": 32.25,
      "learning_rate": 6.464751958224544e-07,
      "loss": 2.3199,
      "step": 620
    },
    {
      "epoch": 0.012032704514849344,
      "grad_norm": 33.25,
      "learning_rate": 6.673629242819844e-07,
      "loss": 2.3505,
      "step": 640
    },
    {
      "epoch": 0.012408726530938387,
      "grad_norm": 41.0,
      "learning_rate": 6.882506527415145e-07,
      "loss": 2.3872,
      "step": 660
    },
    {
      "epoch": 0.012784748547027429,
      "grad_norm": 15.625,
      "learning_rate": 7.091383812010443e-07,
      "loss": 2.3008,
      "step": 680
    },
    {
      "epoch": 0.01316077056311647,
      "grad_norm": 31.5,
      "learning_rate": 7.300261096605745e-07,
      "loss": 2.168,
      "step": 700
    },
    {
      "epoch": 0.013536792579205512,
      "grad_norm": 71.0,
      "learning_rate": 7.509138381201045e-07,
      "loss": 2.2318,
      "step": 720
    },
    {
      "epoch": 0.013912814595294554,
      "grad_norm": 32.25,
      "learning_rate": 7.718015665796345e-07,
      "loss": 2.2759,
      "step": 740
    },
    {
      "epoch": 0.014288836611383596,
      "grad_norm": 71.0,
      "learning_rate": 7.926892950391646e-07,
      "loss": 2.2838,
      "step": 760
    },
    {
      "epoch": 0.01466485862747264,
      "grad_norm": 37.0,
      "learning_rate": 8.135770234986947e-07,
      "loss": 2.2449,
      "step": 780
    },
    {
      "epoch": 0.015040880643561681,
      "grad_norm": 100.0,
      "learning_rate": 8.344647519582245e-07,
      "loss": 2.2566,
      "step": 800
    },
    {
      "epoch": 0.015416902659650723,
      "grad_norm": 52.5,
      "learning_rate": 8.553524804177546e-07,
      "loss": 2.2765,
      "step": 820
    },
    {
      "epoch": 0.015792924675739766,
      "grad_norm": 30.0,
      "learning_rate": 8.762402088772847e-07,
      "loss": 2.2282,
      "step": 840
    },
    {
      "epoch": 0.016168946691828806,
      "grad_norm": 22.75,
      "learning_rate": 8.971279373368147e-07,
      "loss": 2.2817,
      "step": 860
    },
    {
      "epoch": 0.01654496870791785,
      "grad_norm": 70.5,
      "learning_rate": 9.180156657963447e-07,
      "loss": 2.209,
      "step": 880
    },
    {
      "epoch": 0.01692099072400689,
      "grad_norm": 107.0,
      "learning_rate": 9.389033942558748e-07,
      "loss": 2.2978,
      "step": 900
    },
    {
      "epoch": 0.017297012740095933,
      "grad_norm": 52.5,
      "learning_rate": 9.597911227154048e-07,
      "loss": 2.2589,
      "step": 920
    },
    {
      "epoch": 0.017673034756184973,
      "grad_norm": 90.0,
      "learning_rate": 9.806788511749348e-07,
      "loss": 2.1987,
      "step": 940
    },
    {
      "epoch": 0.018049056772274016,
      "grad_norm": 45.75,
      "learning_rate": 1.0015665796344648e-06,
      "loss": 2.1721,
      "step": 960
    },
    {
      "epoch": 0.01842507878836306,
      "grad_norm": 191.0,
      "learning_rate": 1.0224543080939948e-06,
      "loss": 2.2062,
      "step": 980
    },
    {
      "epoch": 0.0188011008044521,
      "grad_norm": 67.5,
      "learning_rate": 1.0433420365535249e-06,
      "loss": 2.1984,
      "step": 1000
    },
    {
      "epoch": 0.019177122820541143,
      "grad_norm": 61.25,
      "learning_rate": 1.0642297650130549e-06,
      "loss": 2.2189,
      "step": 1020
    },
    {
      "epoch": 0.019553144836630183,
      "grad_norm": 49.5,
      "learning_rate": 1.085117493472585e-06,
      "loss": 2.2025,
      "step": 1040
    },
    {
      "epoch": 0.019929166852719227,
      "grad_norm": 80.5,
      "learning_rate": 1.1060052219321151e-06,
      "loss": 2.1776,
      "step": 1060
    },
    {
      "epoch": 0.02030518886880827,
      "grad_norm": 139.0,
      "learning_rate": 1.126892950391645e-06,
      "loss": 2.1829,
      "step": 1080
    },
    {
      "epoch": 0.02068121088489731,
      "grad_norm": 127.0,
      "learning_rate": 1.147780678851175e-06,
      "loss": 2.1865,
      "step": 1100
    },
    {
      "epoch": 0.021057232900986354,
      "grad_norm": 163.0,
      "learning_rate": 1.168668407310705e-06,
      "loss": 2.1854,
      "step": 1120
    },
    {
      "epoch": 0.021433254917075394,
      "grad_norm": 71.5,
      "learning_rate": 1.189556135770235e-06,
      "loss": 2.163,
      "step": 1140
    },
    {
      "epoch": 0.021809276933164437,
      "grad_norm": 98.0,
      "learning_rate": 1.210443864229765e-06,
      "loss": 2.071,
      "step": 1160
    },
    {
      "epoch": 0.02218529894925348,
      "grad_norm": 165.0,
      "learning_rate": 1.2313315926892953e-06,
      "loss": 2.145,
      "step": 1180
    },
    {
      "epoch": 0.02256132096534252,
      "grad_norm": 35.5,
      "learning_rate": 1.2522193211488251e-06,
      "loss": 2.0652,
      "step": 1200
    },
    {
      "epoch": 0.022937342981431564,
      "grad_norm": 115.5,
      "learning_rate": 1.2731070496083554e-06,
      "loss": 2.1296,
      "step": 1220
    },
    {
      "epoch": 0.023313364997520604,
      "grad_norm": 90.0,
      "learning_rate": 1.2939947780678852e-06,
      "loss": 2.0437,
      "step": 1240
    },
    {
      "epoch": 0.023689387013609647,
      "grad_norm": 45.0,
      "learning_rate": 1.3148825065274152e-06,
      "loss": 2.0833,
      "step": 1260
    },
    {
      "epoch": 0.024065409029698687,
      "grad_norm": 32.75,
      "learning_rate": 1.3357702349869452e-06,
      "loss": 2.0352,
      "step": 1280
    },
    {
      "epoch": 0.02444143104578773,
      "grad_norm": 153.0,
      "learning_rate": 1.3566579634464752e-06,
      "loss": 2.054,
      "step": 1300
    },
    {
      "epoch": 0.024817453061876774,
      "grad_norm": 149.0,
      "learning_rate": 1.3775456919060055e-06,
      "loss": 2.0609,
      "step": 1320
    },
    {
      "epoch": 0.025193475077965814,
      "grad_norm": 50.75,
      "learning_rate": 1.3984334203655353e-06,
      "loss": 1.97,
      "step": 1340
    },
    {
      "epoch": 0.025569497094054858,
      "grad_norm": 121.5,
      "learning_rate": 1.4193211488250655e-06,
      "loss": 1.9576,
      "step": 1360
    },
    {
      "epoch": 0.025945519110143898,
      "grad_norm": 31.0,
      "learning_rate": 1.4402088772845953e-06,
      "loss": 2.0156,
      "step": 1380
    },
    {
      "epoch": 0.02632154112623294,
      "grad_norm": 133.0,
      "learning_rate": 1.4610966057441254e-06,
      "loss": 2.0576,
      "step": 1400
    },
    {
      "epoch": 0.026697563142321985,
      "grad_norm": 41.75,
      "learning_rate": 1.4819843342036556e-06,
      "loss": 2.0427,
      "step": 1420
    },
    {
      "epoch": 0.027073585158411025,
      "grad_norm": 89.5,
      "learning_rate": 1.5028720626631854e-06,
      "loss": 1.9948,
      "step": 1440
    },
    {
      "epoch": 0.027449607174500068,
      "grad_norm": 123.5,
      "learning_rate": 1.5237597911227157e-06,
      "loss": 1.987,
      "step": 1460
    },
    {
      "epoch": 0.027825629190589108,
      "grad_norm": 72.0,
      "learning_rate": 1.5446475195822455e-06,
      "loss": 2.011,
      "step": 1480
    },
    {
      "epoch": 0.02820165120667815,
      "grad_norm": 58.5,
      "learning_rate": 1.5655352480417757e-06,
      "loss": 1.9926,
      "step": 1500
    },
    {
      "epoch": 0.02857767322276719,
      "grad_norm": 50.5,
      "learning_rate": 1.5864229765013055e-06,
      "loss": 1.9495,
      "step": 1520
    },
    {
      "epoch": 0.028953695238856235,
      "grad_norm": 58.5,
      "learning_rate": 1.6073107049608356e-06,
      "loss": 1.9988,
      "step": 1540
    },
    {
      "epoch": 0.02932971725494528,
      "grad_norm": 76.0,
      "learning_rate": 1.6281984334203658e-06,
      "loss": 1.98,
      "step": 1560
    },
    {
      "epoch": 0.02970573927103432,
      "grad_norm": 99.0,
      "learning_rate": 1.6490861618798956e-06,
      "loss": 1.9849,
      "step": 1580
    },
    {
      "epoch": 0.030081761287123362,
      "grad_norm": 116.0,
      "learning_rate": 1.6699738903394258e-06,
      "loss": 1.9464,
      "step": 1600
    },
    {
      "epoch": 0.030457783303212402,
      "grad_norm": 119.0,
      "learning_rate": 1.6908616187989557e-06,
      "loss": 1.9654,
      "step": 1620
    },
    {
      "epoch": 0.030833805319301445,
      "grad_norm": 130.0,
      "learning_rate": 1.7117493472584859e-06,
      "loss": 1.9718,
      "step": 1640
    },
    {
      "epoch": 0.03120982733539049,
      "grad_norm": 200.0,
      "learning_rate": 1.732637075718016e-06,
      "loss": 1.9127,
      "step": 1660
    },
    {
      "epoch": 0.03158584935147953,
      "grad_norm": 228.0,
      "learning_rate": 1.7535248041775457e-06,
      "loss": 1.9649,
      "step": 1680
    },
    {
      "epoch": 0.03196187136756857,
      "grad_norm": 159.0,
      "learning_rate": 1.774412532637076e-06,
      "loss": 1.8952,
      "step": 1700
    },
    {
      "epoch": 0.03233789338365761,
      "grad_norm": 87.0,
      "learning_rate": 1.7953002610966058e-06,
      "loss": 1.9328,
      "step": 1720
    },
    {
      "epoch": 0.032713915399746656,
      "grad_norm": 129.0,
      "learning_rate": 1.816187989556136e-06,
      "loss": 1.9531,
      "step": 1740
    },
    {
      "epoch": 0.0330899374158357,
      "grad_norm": 187.0,
      "learning_rate": 1.8370757180156658e-06,
      "loss": 1.8911,
      "step": 1760
    },
    {
      "epoch": 0.03346595943192474,
      "grad_norm": 140.0,
      "learning_rate": 1.857963446475196e-06,
      "loss": 1.9228,
      "step": 1780
    },
    {
      "epoch": 0.03384198144801378,
      "grad_norm": 152.0,
      "learning_rate": 1.878851174934726e-06,
      "loss": 1.9264,
      "step": 1800
    },
    {
      "epoch": 0.03421800346410282,
      "grad_norm": 112.0,
      "learning_rate": 1.899738903394256e-06,
      "loss": 1.957,
      "step": 1820
    },
    {
      "epoch": 0.034594025480191866,
      "grad_norm": 81.5,
      "learning_rate": 1.920626631853786e-06,
      "loss": 1.9084,
      "step": 1840
    },
    {
      "epoch": 0.03497004749628091,
      "grad_norm": 52.25,
      "learning_rate": 1.941514360313316e-06,
      "loss": 1.895,
      "step": 1860
    },
    {
      "epoch": 0.035346069512369946,
      "grad_norm": 52.75,
      "learning_rate": 1.9624020887728464e-06,
      "loss": 1.8667,
      "step": 1880
    },
    {
      "epoch": 0.03572209152845899,
      "grad_norm": 73.0,
      "learning_rate": 1.9832898172323762e-06,
      "loss": 1.8782,
      "step": 1900
    },
    {
      "epoch": 0.03609811354454803,
      "grad_norm": 157.0,
      "learning_rate": 2.004177545691906e-06,
      "loss": 1.8986,
      "step": 1920
    },
    {
      "epoch": 0.036474135560637076,
      "grad_norm": 124.5,
      "learning_rate": 2.0250652741514363e-06,
      "loss": 1.8863,
      "step": 1940
    },
    {
      "epoch": 0.03685015757672612,
      "grad_norm": 211.0,
      "learning_rate": 2.045953002610966e-06,
      "loss": 1.8851,
      "step": 1960
    },
    {
      "epoch": 0.037226179592815156,
      "grad_norm": 157.0,
      "learning_rate": 2.0668407310704963e-06,
      "loss": 1.8669,
      "step": 1980
    },
    {
      "epoch": 0.0376022016089042,
      "grad_norm": 163.0,
      "learning_rate": 2.087728459530026e-06,
      "loss": 1.873,
      "step": 2000
    },
    {
      "epoch": 0.03797822362499324,
      "grad_norm": 76.5,
      "learning_rate": 2.1086161879895564e-06,
      "loss": 1.8493,
      "step": 2020
    },
    {
      "epoch": 0.03835424564108229,
      "grad_norm": 250.0,
      "learning_rate": 2.129503916449086e-06,
      "loss": 1.8835,
      "step": 2040
    },
    {
      "epoch": 0.03873026765717133,
      "grad_norm": 86.0,
      "learning_rate": 2.1503916449086164e-06,
      "loss": 1.8291,
      "step": 2060
    },
    {
      "epoch": 0.03910628967326037,
      "grad_norm": 81.5,
      "learning_rate": 2.1712793733681462e-06,
      "loss": 1.8068,
      "step": 2080
    },
    {
      "epoch": 0.03948231168934941,
      "grad_norm": 94.5,
      "learning_rate": 2.1921671018276765e-06,
      "loss": 1.7797,
      "step": 2100
    },
    {
      "epoch": 0.03985833370543845,
      "grad_norm": 153.0,
      "learning_rate": 2.2130548302872067e-06,
      "loss": 1.8606,
      "step": 2120
    },
    {
      "epoch": 0.0402343557215275,
      "grad_norm": 160.0,
      "learning_rate": 2.2339425587467365e-06,
      "loss": 1.8179,
      "step": 2140
    },
    {
      "epoch": 0.04061037773761654,
      "grad_norm": 252.0,
      "learning_rate": 2.2548302872062668e-06,
      "loss": 1.8003,
      "step": 2160
    },
    {
      "epoch": 0.04098639975370558,
      "grad_norm": 272.0,
      "learning_rate": 2.2757180156657966e-06,
      "loss": 1.7933,
      "step": 2180
    },
    {
      "epoch": 0.04136242176979462,
      "grad_norm": 66.5,
      "learning_rate": 2.2966057441253264e-06,
      "loss": 1.8021,
      "step": 2200
    },
    {
      "epoch": 0.041738443785883664,
      "grad_norm": 132.0,
      "learning_rate": 2.3174934725848566e-06,
      "loss": 1.757,
      "step": 2220
    },
    {
      "epoch": 0.04211446580197271,
      "grad_norm": 101.0,
      "learning_rate": 2.3383812010443865e-06,
      "loss": 1.7466,
      "step": 2240
    },
    {
      "epoch": 0.04249048781806175,
      "grad_norm": 111.5,
      "learning_rate": 2.3592689295039167e-06,
      "loss": 1.7771,
      "step": 2260
    },
    {
      "epoch": 0.04286650983415079,
      "grad_norm": 64.0,
      "learning_rate": 2.3801566579634465e-06,
      "loss": 1.754,
      "step": 2280
    },
    {
      "epoch": 0.04324253185023983,
      "grad_norm": 220.0,
      "learning_rate": 2.4010443864229767e-06,
      "loss": 1.7484,
      "step": 2300
    },
    {
      "epoch": 0.043618553866328874,
      "grad_norm": 97.5,
      "learning_rate": 2.4219321148825066e-06,
      "loss": 1.7204,
      "step": 2320
    },
    {
      "epoch": 0.04399457588241792,
      "grad_norm": 110.5,
      "learning_rate": 2.442819843342037e-06,
      "loss": 1.7732,
      "step": 2340
    },
    {
      "epoch": 0.04437059789850696,
      "grad_norm": 73.5,
      "learning_rate": 2.463707571801567e-06,
      "loss": 1.7447,
      "step": 2360
    },
    {
      "epoch": 0.044746619914596,
      "grad_norm": 78.5,
      "learning_rate": 2.484595300261097e-06,
      "loss": 1.7127,
      "step": 2380
    },
    {
      "epoch": 0.04512264193068504,
      "grad_norm": 63.25,
      "learning_rate": 2.5054830287206267e-06,
      "loss": 1.6951,
      "step": 2400
    },
    {
      "epoch": 0.045498663946774084,
      "grad_norm": 56.25,
      "learning_rate": 2.5263707571801573e-06,
      "loss": 1.6848,
      "step": 2420
    },
    {
      "epoch": 0.04587468596286313,
      "grad_norm": 69.0,
      "learning_rate": 2.547258485639687e-06,
      "loss": 1.7051,
      "step": 2440
    },
    {
      "epoch": 0.046250707978952164,
      "grad_norm": 83.0,
      "learning_rate": 2.568146214099217e-06,
      "loss": 1.6354,
      "step": 2460
    },
    {
      "epoch": 0.04662672999504121,
      "grad_norm": 90.0,
      "learning_rate": 2.5890339425587468e-06,
      "loss": 1.643,
      "step": 2480
    },
    {
      "epoch": 0.04700275201113025,
      "grad_norm": 62.5,
      "learning_rate": 2.6099216710182766e-06,
      "loss": 1.6811,
      "step": 2500
    },
    {
      "epoch": 0.047378774027219295,
      "grad_norm": 199.0,
      "learning_rate": 2.6308093994778072e-06,
      "loss": 1.6851,
      "step": 2520
    },
    {
      "epoch": 0.04775479604330834,
      "grad_norm": 57.75,
      "learning_rate": 2.651697127937337e-06,
      "loss": 1.6055,
      "step": 2540
    },
    {
      "epoch": 0.048130818059397375,
      "grad_norm": 196.0,
      "learning_rate": 2.672584856396867e-06,
      "loss": 1.6079,
      "step": 2560
    },
    {
      "epoch": 0.04850684007548642,
      "grad_norm": 149.0,
      "learning_rate": 2.693472584856397e-06,
      "loss": 1.6273,
      "step": 2580
    },
    {
      "epoch": 0.04888286209157546,
      "grad_norm": 95.5,
      "learning_rate": 2.714360313315927e-06,
      "loss": 1.6333,
      "step": 2600
    },
    {
      "epoch": 0.049258884107664505,
      "grad_norm": 72.0,
      "learning_rate": 2.735248041775457e-06,
      "loss": 1.6026,
      "step": 2620
    },
    {
      "epoch": 0.04963490612375355,
      "grad_norm": 338.0,
      "learning_rate": 2.7561357702349874e-06,
      "loss": 1.5909,
      "step": 2640
    },
    {
      "epoch": 0.050010928139842585,
      "grad_norm": 65.5,
      "learning_rate": 2.777023498694517e-06,
      "loss": 1.6058,
      "step": 2660
    },
    {
      "epoch": 0.05038695015593163,
      "grad_norm": 126.0,
      "learning_rate": 2.797911227154047e-06,
      "loss": 1.5821,
      "step": 2680
    },
    {
      "epoch": 0.05076297217202067,
      "grad_norm": 142.0,
      "learning_rate": 2.8187989556135777e-06,
      "loss": 1.5928,
      "step": 2700
    },
    {
      "epoch": 0.051138994188109715,
      "grad_norm": 59.0,
      "learning_rate": 2.8396866840731075e-06,
      "loss": 1.5513,
      "step": 2720
    },
    {
      "epoch": 0.05151501620419876,
      "grad_norm": 173.0,
      "learning_rate": 2.8605744125326373e-06,
      "loss": 1.519,
      "step": 2740
    },
    {
      "epoch": 0.051891038220287795,
      "grad_norm": 118.5,
      "learning_rate": 2.881462140992167e-06,
      "loss": 1.5389,
      "step": 2760
    },
    {
      "epoch": 0.05226706023637684,
      "grad_norm": 121.5,
      "learning_rate": 2.9023498694516974e-06,
      "loss": 1.5027,
      "step": 2780
    },
    {
      "epoch": 0.05264308225246588,
      "grad_norm": 71.5,
      "learning_rate": 2.9232375979112276e-06,
      "loss": 1.5588,
      "step": 2800
    },
    {
      "epoch": 0.053019104268554926,
      "grad_norm": 148.0,
      "learning_rate": 2.9441253263707574e-06,
      "loss": 1.544,
      "step": 2820
    },
    {
      "epoch": 0.05339512628464397,
      "grad_norm": 98.5,
      "learning_rate": 2.9650130548302876e-06,
      "loss": 1.4796,
      "step": 2840
    },
    {
      "epoch": 0.053771148300733006,
      "grad_norm": 119.5,
      "learning_rate": 2.9859007832898175e-06,
      "loss": 1.5498,
      "step": 2860
    },
    {
      "epoch": 0.05414717031682205,
      "grad_norm": 68.0,
      "learning_rate": 3.0067885117493473e-06,
      "loss": 1.5174,
      "step": 2880
    },
    {
      "epoch": 0.05452319233291109,
      "grad_norm": 81.0,
      "learning_rate": 3.027676240208878e-06,
      "loss": 1.5218,
      "step": 2900
    },
    {
      "epoch": 0.054899214349000136,
      "grad_norm": 89.5,
      "learning_rate": 3.0485639686684078e-06,
      "loss": 1.4837,
      "step": 2920
    },
    {
      "epoch": 0.05527523636508917,
      "grad_norm": 175.0,
      "learning_rate": 3.0694516971279376e-06,
      "loss": 1.469,
      "step": 2940
    },
    {
      "epoch": 0.055651258381178216,
      "grad_norm": 188.0,
      "learning_rate": 3.0903394255874674e-06,
      "loss": 1.4704,
      "step": 2960
    },
    {
      "epoch": 0.05602728039726726,
      "grad_norm": 53.5,
      "learning_rate": 3.111227154046997e-06,
      "loss": 1.4528,
      "step": 2980
    },
    {
      "epoch": 0.0564033024133563,
      "grad_norm": 91.0,
      "learning_rate": 3.132114882506528e-06,
      "loss": 1.4783,
      "step": 3000
    },
    {
      "epoch": 0.056779324429445346,
      "grad_norm": 81.5,
      "learning_rate": 3.1530026109660577e-06,
      "loss": 1.4367,
      "step": 3020
    },
    {
      "epoch": 0.05715534644553438,
      "grad_norm": 69.0,
      "learning_rate": 3.1738903394255875e-06,
      "loss": 1.4717,
      "step": 3040
    },
    {
      "epoch": 0.057531368461623426,
      "grad_norm": 207.0,
      "learning_rate": 3.1947780678851177e-06,
      "loss": 1.4713,
      "step": 3060
    },
    {
      "epoch": 0.05790739047771247,
      "grad_norm": 87.5,
      "learning_rate": 3.215665796344648e-06,
      "loss": 1.4269,
      "step": 3080
    },
    {
      "epoch": 0.05828341249380151,
      "grad_norm": 87.0,
      "learning_rate": 3.2365535248041778e-06,
      "loss": 1.4116,
      "step": 3100
    },
    {
      "epoch": 0.05865943450989056,
      "grad_norm": 133.0,
      "learning_rate": 3.257441253263708e-06,
      "loss": 1.423,
      "step": 3120
    },
    {
      "epoch": 0.05903545652597959,
      "grad_norm": 66.0,
      "learning_rate": 3.278328981723238e-06,
      "loss": 1.3921,
      "step": 3140
    },
    {
      "epoch": 0.05941147854206864,
      "grad_norm": 70.0,
      "learning_rate": 3.2992167101827676e-06,
      "loss": 1.4027,
      "step": 3160
    },
    {
      "epoch": 0.05978750055815768,
      "grad_norm": 114.5,
      "learning_rate": 3.3201044386422983e-06,
      "loss": 1.4118,
      "step": 3180
    },
    {
      "epoch": 0.060163522574246724,
      "grad_norm": 64.5,
      "learning_rate": 3.340992167101828e-06,
      "loss": 1.3805,
      "step": 3200
    },
    {
      "epoch": 0.06053954459033577,
      "grad_norm": 76.0,
      "learning_rate": 3.361879895561358e-06,
      "loss": 1.3966,
      "step": 3220
    },
    {
      "epoch": 0.060915566606424804,
      "grad_norm": 52.5,
      "learning_rate": 3.3827676240208877e-06,
      "loss": 1.3991,
      "step": 3240
    },
    {
      "epoch": 0.06129158862251385,
      "grad_norm": 139.0,
      "learning_rate": 3.403655352480418e-06,
      "loss": 1.3851,
      "step": 3260
    },
    {
      "epoch": 0.06166761063860289,
      "grad_norm": 56.25,
      "learning_rate": 3.4245430809399482e-06,
      "loss": 1.3506,
      "step": 3280
    },
    {
      "epoch": 0.062043632654691934,
      "grad_norm": 86.0,
      "learning_rate": 3.445430809399478e-06,
      "loss": 1.3288,
      "step": 3300
    },
    {
      "epoch": 0.06241965467078098,
      "grad_norm": 108.5,
      "learning_rate": 3.4663185378590083e-06,
      "loss": 1.3767,
      "step": 3320
    },
    {
      "epoch": 0.06279567668687001,
      "grad_norm": 46.5,
      "learning_rate": 3.487206266318538e-06,
      "loss": 1.339,
      "step": 3340
    },
    {
      "epoch": 0.06317169870295906,
      "grad_norm": 127.5,
      "learning_rate": 3.5080939947780683e-06,
      "loss": 1.3316,
      "step": 3360
    },
    {
      "epoch": 0.0635477207190481,
      "grad_norm": 53.0,
      "learning_rate": 3.5289817232375986e-06,
      "loss": 1.362,
      "step": 3380
    },
    {
      "epoch": 0.06392374273513714,
      "grad_norm": 75.5,
      "learning_rate": 3.5498694516971284e-06,
      "loss": 1.3073,
      "step": 3400
    },
    {
      "epoch": 0.06429976475122619,
      "grad_norm": 103.5,
      "learning_rate": 3.570757180156658e-06,
      "loss": 1.3008,
      "step": 3420
    },
    {
      "epoch": 0.06467578676731522,
      "grad_norm": 50.25,
      "learning_rate": 3.591644908616188e-06,
      "loss": 1.3438,
      "step": 3440
    },
    {
      "epoch": 0.06505180878340427,
      "grad_norm": 108.0,
      "learning_rate": 3.6125326370757187e-06,
      "loss": 1.3175,
      "step": 3460
    },
    {
      "epoch": 0.06542783079949331,
      "grad_norm": 97.0,
      "learning_rate": 3.6334203655352485e-06,
      "loss": 1.3031,
      "step": 3480
    },
    {
      "epoch": 0.06580385281558235,
      "grad_norm": 124.5,
      "learning_rate": 3.6543080939947783e-06,
      "loss": 1.3161,
      "step": 3500
    },
    {
      "epoch": 0.0661798748316714,
      "grad_norm": 78.5,
      "learning_rate": 3.675195822454308e-06,
      "loss": 1.2822,
      "step": 3520
    },
    {
      "epoch": 0.06655589684776043,
      "grad_norm": 81.5,
      "learning_rate": 3.6960835509138383e-06,
      "loss": 1.3111,
      "step": 3540
    },
    {
      "epoch": 0.06693191886384948,
      "grad_norm": 94.5,
      "learning_rate": 3.7169712793733686e-06,
      "loss": 1.2909,
      "step": 3560
    },
    {
      "epoch": 0.06730794087993852,
      "grad_norm": 86.0,
      "learning_rate": 3.7378590078328984e-06,
      "loss": 1.2535,
      "step": 3580
    },
    {
      "epoch": 0.06768396289602756,
      "grad_norm": 142.0,
      "learning_rate": 3.7587467362924286e-06,
      "loss": 1.2963,
      "step": 3600
    },
    {
      "epoch": 0.06805998491211661,
      "grad_norm": 58.25,
      "learning_rate": 3.7796344647519584e-06,
      "loss": 1.2354,
      "step": 3620
    },
    {
      "epoch": 0.06843600692820564,
      "grad_norm": 67.5,
      "learning_rate": 3.8005221932114883e-06,
      "loss": 1.2719,
      "step": 3640
    },
    {
      "epoch": 0.0688120289442947,
      "grad_norm": 103.0,
      "learning_rate": 3.821409921671019e-06,
      "loss": 1.246,
      "step": 3660
    },
    {
      "epoch": 0.06918805096038373,
      "grad_norm": 110.0,
      "learning_rate": 3.842297650130548e-06,
      "loss": 1.2397,
      "step": 3680
    },
    {
      "epoch": 0.06956407297647277,
      "grad_norm": 45.5,
      "learning_rate": 3.8631853785900785e-06,
      "loss": 1.2576,
      "step": 3700
    },
    {
      "epoch": 0.06994009499256182,
      "grad_norm": 62.0,
      "learning_rate": 3.884073107049609e-06,
      "loss": 1.2273,
      "step": 3720
    },
    {
      "epoch": 0.07031611700865086,
      "grad_norm": 77.5,
      "learning_rate": 3.904960835509139e-06,
      "loss": 1.2366,
      "step": 3740
    },
    {
      "epoch": 0.07069213902473989,
      "grad_norm": 89.5,
      "learning_rate": 3.925848563968669e-06,
      "loss": 1.2027,
      "step": 3760
    },
    {
      "epoch": 0.07106816104082894,
      "grad_norm": 84.0,
      "learning_rate": 3.946736292428199e-06,
      "loss": 1.2029,
      "step": 3780
    },
    {
      "epoch": 0.07144418305691798,
      "grad_norm": 51.0,
      "learning_rate": 3.967624020887729e-06,
      "loss": 1.2181,
      "step": 3800
    },
    {
      "epoch": 0.07182020507300703,
      "grad_norm": 71.0,
      "learning_rate": 3.988511749347258e-06,
      "loss": 1.2405,
      "step": 3820
    },
    {
      "epoch": 0.07219622708909607,
      "grad_norm": 78.0,
      "learning_rate": 4.009399477806789e-06,
      "loss": 1.1956,
      "step": 3840
    },
    {
      "epoch": 0.0725722491051851,
      "grad_norm": 76.0,
      "learning_rate": 4.030287206266319e-06,
      "loss": 1.2008,
      "step": 3860
    },
    {
      "epoch": 0.07294827112127415,
      "grad_norm": 71.0,
      "learning_rate": 4.051174934725849e-06,
      "loss": 1.209,
      "step": 3880
    },
    {
      "epoch": 0.07332429313736319,
      "grad_norm": 91.0,
      "learning_rate": 4.072062663185378e-06,
      "loss": 1.2152,
      "step": 3900
    },
    {
      "epoch": 0.07370031515345224,
      "grad_norm": 65.5,
      "learning_rate": 4.092950391644909e-06,
      "loss": 1.2116,
      "step": 3920
    },
    {
      "epoch": 0.07407633716954128,
      "grad_norm": 84.0,
      "learning_rate": 4.113838120104439e-06,
      "loss": 1.1892,
      "step": 3940
    },
    {
      "epoch": 0.07445235918563031,
      "grad_norm": 80.0,
      "learning_rate": 4.134725848563969e-06,
      "loss": 1.1591,
      "step": 3960
    },
    {
      "epoch": 0.07482838120171936,
      "grad_norm": 93.0,
      "learning_rate": 4.155613577023499e-06,
      "loss": 1.1767,
      "step": 3980
    },
    {
      "epoch": 0.0752044032178084,
      "grad_norm": 83.0,
      "learning_rate": 4.176501305483029e-06,
      "loss": 1.1395,
      "step": 4000
    },
    {
      "epoch": 0.07558042523389745,
      "grad_norm": 72.5,
      "learning_rate": 4.197389033942559e-06,
      "loss": 1.1689,
      "step": 4020
    },
    {
      "epoch": 0.07595644724998649,
      "grad_norm": 58.5,
      "learning_rate": 4.218276762402089e-06,
      "loss": 1.1351,
      "step": 4040
    },
    {
      "epoch": 0.07633246926607552,
      "grad_norm": 64.5,
      "learning_rate": 4.2391644908616194e-06,
      "loss": 1.1314,
      "step": 4060
    },
    {
      "epoch": 0.07670849128216457,
      "grad_norm": 66.5,
      "learning_rate": 4.260052219321149e-06,
      "loss": 1.124,
      "step": 4080
    },
    {
      "epoch": 0.07708451329825361,
      "grad_norm": 42.75,
      "learning_rate": 4.280939947780679e-06,
      "loss": 1.0991,
      "step": 4100
    },
    {
      "epoch": 0.07746053531434266,
      "grad_norm": 63.25,
      "learning_rate": 4.301827676240209e-06,
      "loss": 1.1353,
      "step": 4120
    },
    {
      "epoch": 0.0778365573304317,
      "grad_norm": 47.5,
      "learning_rate": 4.3227154046997395e-06,
      "loss": 1.1189,
      "step": 4140
    },
    {
      "epoch": 0.07821257934652073,
      "grad_norm": 34.0,
      "learning_rate": 4.343603133159269e-06,
      "loss": 1.1196,
      "step": 4160
    },
    {
      "epoch": 0.07858860136260978,
      "grad_norm": 140.0,
      "learning_rate": 4.364490861618799e-06,
      "loss": 1.0964,
      "step": 4180
    },
    {
      "epoch": 0.07896462337869882,
      "grad_norm": 60.75,
      "learning_rate": 4.385378590078329e-06,
      "loss": 1.1315,
      "step": 4200
    },
    {
      "epoch": 0.07934064539478787,
      "grad_norm": 57.75,
      "learning_rate": 4.40626631853786e-06,
      "loss": 1.0911,
      "step": 4220
    },
    {
      "epoch": 0.0797166674108769,
      "grad_norm": 70.5,
      "learning_rate": 4.42715404699739e-06,
      "loss": 1.084,
      "step": 4240
    },
    {
      "epoch": 0.08009268942696594,
      "grad_norm": 29.25,
      "learning_rate": 4.448041775456919e-06,
      "loss": 1.0646,
      "step": 4260
    },
    {
      "epoch": 0.080468711443055,
      "grad_norm": 33.0,
      "learning_rate": 4.4689295039164495e-06,
      "loss": 1.0657,
      "step": 4280
    },
    {
      "epoch": 0.08084473345914403,
      "grad_norm": 75.5,
      "learning_rate": 4.489817232375979e-06,
      "loss": 1.0553,
      "step": 4300
    },
    {
      "epoch": 0.08122075547523308,
      "grad_norm": 49.75,
      "learning_rate": 4.51070496083551e-06,
      "loss": 1.0675,
      "step": 4320
    },
    {
      "epoch": 0.08159677749132212,
      "grad_norm": 40.25,
      "learning_rate": 4.531592689295039e-06,
      "loss": 1.0539,
      "step": 4340
    },
    {
      "epoch": 0.08197279950741115,
      "grad_norm": 40.5,
      "learning_rate": 4.55248041775457e-06,
      "loss": 1.0381,
      "step": 4360
    },
    {
      "epoch": 0.0823488215235002,
      "grad_norm": 51.75,
      "learning_rate": 4.573368146214099e-06,
      "loss": 1.062,
      "step": 4380
    },
    {
      "epoch": 0.08272484353958924,
      "grad_norm": 120.5,
      "learning_rate": 4.59425587467363e-06,
      "loss": 1.0553,
      "step": 4400
    },
    {
      "epoch": 0.08310086555567829,
      "grad_norm": 51.75,
      "learning_rate": 4.6151436031331595e-06,
      "loss": 1.0386,
      "step": 4420
    },
    {
      "epoch": 0.08347688757176733,
      "grad_norm": 29.125,
      "learning_rate": 4.63603133159269e-06,
      "loss": 1.0334,
      "step": 4440
    },
    {
      "epoch": 0.08385290958785636,
      "grad_norm": 43.25,
      "learning_rate": 4.65691906005222e-06,
      "loss": 1.0358,
      "step": 4460
    },
    {
      "epoch": 0.08422893160394541,
      "grad_norm": 34.5,
      "learning_rate": 4.677806788511749e-06,
      "loss": 1.0064,
      "step": 4480
    },
    {
      "epoch": 0.08460495362003445,
      "grad_norm": 40.25,
      "learning_rate": 4.6986945169712796e-06,
      "loss": 1.0205,
      "step": 4500
    },
    {
      "epoch": 0.0849809756361235,
      "grad_norm": 37.75,
      "learning_rate": 4.71958224543081e-06,
      "loss": 0.9937,
      "step": 4520
    },
    {
      "epoch": 0.08535699765221254,
      "grad_norm": 41.25,
      "learning_rate": 4.74046997389034e-06,
      "loss": 1.0081,
      "step": 4540
    },
    {
      "epoch": 0.08573301966830157,
      "grad_norm": 59.25,
      "learning_rate": 4.7613577023498694e-06,
      "loss": 0.9894,
      "step": 4560
    },
    {
      "epoch": 0.08610904168439062,
      "grad_norm": 57.5,
      "learning_rate": 4.7822454308094e-06,
      "loss": 0.9949,
      "step": 4580
    },
    {
      "epoch": 0.08648506370047966,
      "grad_norm": 46.5,
      "learning_rate": 4.80313315926893e-06,
      "loss": 1.0066,
      "step": 4600
    },
    {
      "epoch": 0.08686108571656871,
      "grad_norm": 46.0,
      "learning_rate": 4.82402088772846e-06,
      "loss": 1.001,
      "step": 4620
    },
    {
      "epoch": 0.08723710773265775,
      "grad_norm": 34.5,
      "learning_rate": 4.8449086161879895e-06,
      "loss": 0.9981,
      "step": 4640
    },
    {
      "epoch": 0.08761312974874678,
      "grad_norm": 49.75,
      "learning_rate": 4.86579634464752e-06,
      "loss": 0.9975,
      "step": 4660
    },
    {
      "epoch": 0.08798915176483584,
      "grad_norm": 25.125,
      "learning_rate": 4.88668407310705e-06,
      "loss": 0.9697,
      "step": 4680
    },
    {
      "epoch": 0.08836517378092487,
      "grad_norm": 68.0,
      "learning_rate": 4.90757180156658e-06,
      "loss": 0.9851,
      "step": 4700
    },
    {
      "epoch": 0.08874119579701392,
      "grad_norm": 122.0,
      "learning_rate": 4.9284595300261105e-06,
      "loss": 0.9624,
      "step": 4720
    },
    {
      "epoch": 0.08911721781310296,
      "grad_norm": 68.5,
      "learning_rate": 4.94934725848564e-06,
      "loss": 0.9812,
      "step": 4740
    },
    {
      "epoch": 0.089493239829192,
      "grad_norm": 43.5,
      "learning_rate": 4.97023498694517e-06,
      "loss": 0.9594,
      "step": 4760
    },
    {
      "epoch": 0.08986926184528105,
      "grad_norm": 26.625,
      "learning_rate": 4.9911227154047e-06,
      "loss": 0.9582,
      "step": 4780
    },
    {
      "epoch": 0.09024528386137008,
      "grad_norm": 30.375,
      "learning_rate": 5.012010443864231e-06,
      "loss": 0.9374,
      "step": 4800
    },
    {
      "epoch": 0.09062130587745912,
      "grad_norm": 43.75,
      "learning_rate": 5.03289817232376e-06,
      "loss": 0.9564,
      "step": 4820
    },
    {
      "epoch": 0.09099732789354817,
      "grad_norm": 61.5,
      "learning_rate": 5.05378590078329e-06,
      "loss": 0.9293,
      "step": 4840
    },
    {
      "epoch": 0.0913733499096372,
      "grad_norm": 34.25,
      "learning_rate": 5.07467362924282e-06,
      "loss": 0.9345,
      "step": 4860
    },
    {
      "epoch": 0.09174937192572626,
      "grad_norm": 27.375,
      "learning_rate": 5.09556135770235e-06,
      "loss": 0.9374,
      "step": 4880
    },
    {
      "epoch": 0.09212539394181529,
      "grad_norm": 42.0,
      "learning_rate": 5.11644908616188e-06,
      "loss": 0.9305,
      "step": 4900
    },
    {
      "epoch": 0.09250141595790433,
      "grad_norm": 27.875,
      "learning_rate": 5.137336814621411e-06,
      "loss": 0.9216,
      "step": 4920
    },
    {
      "epoch": 0.09287743797399338,
      "grad_norm": 54.0,
      "learning_rate": 5.1582245430809406e-06,
      "loss": 0.9187,
      "step": 4940
    },
    {
      "epoch": 0.09325345999008242,
      "grad_norm": 33.5,
      "learning_rate": 5.179112271540471e-06,
      "loss": 0.9057,
      "step": 4960
    },
    {
      "epoch": 0.09362948200617147,
      "grad_norm": 34.5,
      "learning_rate": 5.2e-06,
      "loss": 0.8961,
      "step": 4980
    },
    {
      "epoch": 0.0940055040222605,
      "grad_norm": 33.75,
      "learning_rate": 5.2208877284595304e-06,
      "loss": 0.9232,
      "step": 5000
    },
    {
      "epoch": 0.09438152603834954,
      "grad_norm": 21.375,
      "learning_rate": 5.241775456919061e-06,
      "loss": 0.9002,
      "step": 5020
    },
    {
      "epoch": 0.09475754805443859,
      "grad_norm": 37.0,
      "learning_rate": 5.26266318537859e-06,
      "loss": 0.9141,
      "step": 5040
    },
    {
      "epoch": 0.09513357007052763,
      "grad_norm": 31.75,
      "learning_rate": 5.28355091383812e-06,
      "loss": 0.9062,
      "step": 5060
    },
    {
      "epoch": 0.09550959208661668,
      "grad_norm": 25.0,
      "learning_rate": 5.3044386422976505e-06,
      "loss": 0.8911,
      "step": 5080
    },
    {
      "epoch": 0.09588561410270571,
      "grad_norm": 39.75,
      "learning_rate": 5.32532637075718e-06,
      "loss": 0.8961,
      "step": 5100
    },
    {
      "epoch": 0.09626163611879475,
      "grad_norm": 21.375,
      "learning_rate": 5.346214099216711e-06,
      "loss": 0.8682,
      "step": 5120
    },
    {
      "epoch": 0.0966376581348838,
      "grad_norm": 23.625,
      "learning_rate": 5.367101827676241e-06,
      "loss": 0.8722,
      "step": 5140
    },
    {
      "epoch": 0.09701368015097284,
      "grad_norm": 32.75,
      "learning_rate": 5.387989556135771e-06,
      "loss": 0.8664,
      "step": 5160
    },
    {
      "epoch": 0.09738970216706189,
      "grad_norm": 22.875,
      "learning_rate": 5.408877284595301e-06,
      "loss": 0.8557,
      "step": 5180
    },
    {
      "epoch": 0.09776572418315092,
      "grad_norm": 43.5,
      "learning_rate": 5.429765013054831e-06,
      "loss": 0.8546,
      "step": 5200
    },
    {
      "epoch": 0.09814174619923996,
      "grad_norm": 22.375,
      "learning_rate": 5.4506527415143605e-06,
      "loss": 0.8568,
      "step": 5220
    },
    {
      "epoch": 0.09851776821532901,
      "grad_norm": 24.75,
      "learning_rate": 5.471540469973891e-06,
      "loss": 0.8628,
      "step": 5240
    },
    {
      "epoch": 0.09889379023141805,
      "grad_norm": 23.75,
      "learning_rate": 5.49242819843342e-06,
      "loss": 0.8456,
      "step": 5260
    },
    {
      "epoch": 0.0992698122475071,
      "grad_norm": 23.5,
      "learning_rate": 5.51331592689295e-06,
      "loss": 0.8357,
      "step": 5280
    },
    {
      "epoch": 0.09964583426359613,
      "grad_norm": 14.9375,
      "learning_rate": 5.5342036553524814e-06,
      "loss": 0.8189,
      "step": 5300
    },
    {
      "epoch": 0.10002185627968517,
      "grad_norm": 40.25,
      "learning_rate": 5.555091383812012e-06,
      "loss": 0.8384,
      "step": 5320
    },
    {
      "epoch": 0.10039787829577422,
      "grad_norm": 48.0,
      "learning_rate": 5.575979112271541e-06,
      "loss": 0.8441,
      "step": 5340
    },
    {
      "epoch": 0.10077390031186326,
      "grad_norm": 25.0,
      "learning_rate": 5.596866840731071e-06,
      "loss": 0.81,
      "step": 5360
    },
    {
      "epoch": 0.10114992232795231,
      "grad_norm": 53.25,
      "learning_rate": 5.617754569190601e-06,
      "loss": 0.846,
      "step": 5380
    },
    {
      "epoch": 0.10152594434404134,
      "grad_norm": 21.25,
      "learning_rate": 5.638642297650131e-06,
      "loss": 0.8235,
      "step": 5400
    },
    {
      "epoch": 0.10190196636013038,
      "grad_norm": 22.75,
      "learning_rate": 5.659530026109661e-06,
      "loss": 0.8416,
      "step": 5420
    },
    {
      "epoch": 0.10227798837621943,
      "grad_norm": 10.4375,
      "learning_rate": 5.6804177545691906e-06,
      "loss": 0.8025,
      "step": 5440
    },
    {
      "epoch": 0.10265401039230847,
      "grad_norm": 11.3125,
      "learning_rate": 5.701305483028721e-06,
      "loss": 0.8071,
      "step": 5460
    },
    {
      "epoch": 0.10303003240839752,
      "grad_norm": 13.0,
      "learning_rate": 5.72219321148825e-06,
      "loss": 0.819,
      "step": 5480
    },
    {
      "epoch": 0.10340605442448655,
      "grad_norm": 36.25,
      "learning_rate": 5.743080939947781e-06,
      "loss": 0.809,
      "step": 5500
    },
    {
      "epoch": 0.10378207644057559,
      "grad_norm": 22.0,
      "learning_rate": 5.7639686684073115e-06,
      "loss": 0.8222,
      "step": 5520
    },
    {
      "epoch": 0.10415809845666464,
      "grad_norm": 20.375,
      "learning_rate": 5.784856396866842e-06,
      "loss": 0.7892,
      "step": 5540
    },
    {
      "epoch": 0.10453412047275368,
      "grad_norm": 23.875,
      "learning_rate": 5.805744125326371e-06,
      "loss": 0.8109,
      "step": 5560
    },
    {
      "epoch": 0.10491014248884273,
      "grad_norm": 43.5,
      "learning_rate": 5.826631853785901e-06,
      "loss": 0.7978,
      "step": 5580
    },
    {
      "epoch": 0.10528616450493176,
      "grad_norm": 22.25,
      "learning_rate": 5.847519582245431e-06,
      "loss": 0.7947,
      "step": 5600
    },
    {
      "epoch": 0.1056621865210208,
      "grad_norm": 9.5,
      "learning_rate": 5.868407310704961e-06,
      "loss": 0.8045,
      "step": 5620
    },
    {
      "epoch": 0.10603820853710985,
      "grad_norm": 12.375,
      "learning_rate": 5.889295039164491e-06,
      "loss": 0.8083,
      "step": 5640
    },
    {
      "epoch": 0.10641423055319889,
      "grad_norm": 30.125,
      "learning_rate": 5.910182767624021e-06,
      "loss": 0.8052,
      "step": 5660
    },
    {
      "epoch": 0.10679025256928794,
      "grad_norm": 20.125,
      "learning_rate": 5.931070496083552e-06,
      "loss": 0.7854,
      "step": 5680
    },
    {
      "epoch": 0.10716627458537697,
      "grad_norm": 15.25,
      "learning_rate": 5.951958224543082e-06,
      "loss": 0.7947,
      "step": 5700
    },
    {
      "epoch": 0.10754229660146601,
      "grad_norm": 8.5625,
      "learning_rate": 5.972845953002611e-06,
      "loss": 0.7992,
      "step": 5720
    },
    {
      "epoch": 0.10791831861755506,
      "grad_norm": 20.625,
      "learning_rate": 5.993733681462142e-06,
      "loss": 0.7855,
      "step": 5740
    },
    {
      "epoch": 0.1082943406336441,
      "grad_norm": 10.125,
      "learning_rate": 6.014621409921672e-06,
      "loss": 0.7857,
      "step": 5760
    },
    {
      "epoch": 0.10867036264973315,
      "grad_norm": 33.5,
      "learning_rate": 6.035509138381201e-06,
      "loss": 0.7908,
      "step": 5780
    },
    {
      "epoch": 0.10904638466582219,
      "grad_norm": 12.75,
      "learning_rate": 6.0563968668407315e-06,
      "loss": 0.771,
      "step": 5800
    },
    {
      "epoch": 0.10942240668191122,
      "grad_norm": 23.5,
      "learning_rate": 6.077284595300262e-06,
      "loss": 0.7734,
      "step": 5820
    },
    {
      "epoch": 0.10979842869800027,
      "grad_norm": 12.125,
      "learning_rate": 6.098172323759791e-06,
      "loss": 0.7723,
      "step": 5840
    },
    {
      "epoch": 0.11017445071408931,
      "grad_norm": 14.0,
      "learning_rate": 6.119060052219322e-06,
      "loss": 0.7602,
      "step": 5860
    },
    {
      "epoch": 0.11055047273017835,
      "grad_norm": 32.0,
      "learning_rate": 6.139947780678852e-06,
      "loss": 0.7809,
      "step": 5880
    },
    {
      "epoch": 0.1109264947462674,
      "grad_norm": 15.4375,
      "learning_rate": 6.160835509138382e-06,
      "loss": 0.7905,
      "step": 5900
    },
    {
      "epoch": 0.11130251676235643,
      "grad_norm": 24.875,
      "learning_rate": 6.181723237597912e-06,
      "loss": 0.7764,
      "step": 5920
    },
    {
      "epoch": 0.11167853877844548,
      "grad_norm": 19.625,
      "learning_rate": 6.202610966057441e-06,
      "loss": 0.7877,
      "step": 5940
    },
    {
      "epoch": 0.11205456079453452,
      "grad_norm": 13.5625,
      "learning_rate": 6.223498694516972e-06,
      "loss": 0.7779,
      "step": 5960
    },
    {
      "epoch": 0.11243058281062356,
      "grad_norm": 15.9375,
      "learning_rate": 6.244386422976502e-06,
      "loss": 0.7711,
      "step": 5980
    },
    {
      "epoch": 0.1128066048267126,
      "grad_norm": 7.90625,
      "learning_rate": 6.265274151436031e-06,
      "loss": 0.7661,
      "step": 6000
    },
    {
      "epoch": 0.11318262684280164,
      "grad_norm": 7.75,
      "learning_rate": 6.2861618798955615e-06,
      "loss": 0.76,
      "step": 6020
    },
    {
      "epoch": 0.11355864885889069,
      "grad_norm": 9.9375,
      "learning_rate": 6.307049608355092e-06,
      "loss": 0.7445,
      "step": 6040
    },
    {
      "epoch": 0.11393467087497973,
      "grad_norm": 12.125,
      "learning_rate": 6.327937336814622e-06,
      "loss": 0.7601,
      "step": 6060
    },
    {
      "epoch": 0.11431069289106877,
      "grad_norm": 10.5,
      "learning_rate": 6.348825065274152e-06,
      "loss": 0.7641,
      "step": 6080
    },
    {
      "epoch": 0.11468671490715782,
      "grad_norm": 26.0,
      "learning_rate": 6.3697127937336825e-06,
      "loss": 0.7501,
      "step": 6100
    },
    {
      "epoch": 0.11506273692324685,
      "grad_norm": 18.625,
      "learning_rate": 6.390600522193212e-06,
      "loss": 0.7625,
      "step": 6120
    },
    {
      "epoch": 0.1154387589393359,
      "grad_norm": 12.375,
      "learning_rate": 6.411488250652742e-06,
      "loss": 0.752,
      "step": 6140
    },
    {
      "epoch": 0.11581478095542494,
      "grad_norm": 7.21875,
      "learning_rate": 6.432375979112272e-06,
      "loss": 0.7546,
      "step": 6160
    },
    {
      "epoch": 0.11619080297151398,
      "grad_norm": 13.5625,
      "learning_rate": 6.453263707571802e-06,
      "loss": 0.7533,
      "step": 6180
    },
    {
      "epoch": 0.11656682498760303,
      "grad_norm": 7.78125,
      "learning_rate": 6.474151436031332e-06,
      "loss": 0.7427,
      "step": 6200
    },
    {
      "epoch": 0.11694284700369206,
      "grad_norm": 7.34375,
      "learning_rate": 6.495039164490861e-06,
      "loss": 0.7569,
      "step": 6220
    },
    {
      "epoch": 0.11731886901978111,
      "grad_norm": 11.5,
      "learning_rate": 6.5159268929503924e-06,
      "loss": 0.7568,
      "step": 6240
    },
    {
      "epoch": 0.11769489103587015,
      "grad_norm": 9.125,
      "learning_rate": 6.536814621409923e-06,
      "loss": 0.7583,
      "step": 6260
    },
    {
      "epoch": 0.11807091305195919,
      "grad_norm": 6.0,
      "learning_rate": 6.557702349869453e-06,
      "loss": 0.745,
      "step": 6280
    },
    {
      "epoch": 0.11844693506804824,
      "grad_norm": 8.9375,
      "learning_rate": 6.578590078328982e-06,
      "loss": 0.7423,
      "step": 6300
    },
    {
      "epoch": 0.11882295708413727,
      "grad_norm": 17.375,
      "learning_rate": 6.5994778067885125e-06,
      "loss": 0.7417,
      "step": 6320
    },
    {
      "epoch": 0.11919897910022632,
      "grad_norm": 6.28125,
      "learning_rate": 6.620365535248042e-06,
      "loss": 0.7414,
      "step": 6340
    },
    {
      "epoch": 0.11957500111631536,
      "grad_norm": 18.375,
      "learning_rate": 6.641253263707572e-06,
      "loss": 0.7489,
      "step": 6360
    },
    {
      "epoch": 0.1199510231324044,
      "grad_norm": 18.75,
      "learning_rate": 6.662140992167102e-06,
      "loss": 0.748,
      "step": 6380
    },
    {
      "epoch": 0.12032704514849345,
      "grad_norm": 7.46875,
      "learning_rate": 6.683028720626632e-06,
      "loss": 0.7259,
      "step": 6400
    },
    {
      "epoch": 0.12070306716458248,
      "grad_norm": 6.78125,
      "learning_rate": 6.703916449086162e-06,
      "loss": 0.7454,
      "step": 6420
    },
    {
      "epoch": 0.12107908918067153,
      "grad_norm": 12.0,
      "learning_rate": 6.724804177545693e-06,
      "loss": 0.7378,
      "step": 6440
    },
    {
      "epoch": 0.12145511119676057,
      "grad_norm": 10.375,
      "learning_rate": 6.7456919060052225e-06,
      "loss": 0.7508,
      "step": 6460
    },
    {
      "epoch": 0.12183113321284961,
      "grad_norm": 8.875,
      "learning_rate": 6.766579634464753e-06,
      "loss": 0.7262,
      "step": 6480
    },
    {
      "epoch": 0.12220715522893866,
      "grad_norm": 10.375,
      "learning_rate": 6.787467362924283e-06,
      "loss": 0.7445,
      "step": 6500
    },
    {
      "epoch": 0.1225831772450277,
      "grad_norm": 7.46875,
      "learning_rate": 6.808355091383812e-06,
      "loss": 0.7337,
      "step": 6520
    },
    {
      "epoch": 0.12295919926111674,
      "grad_norm": 20.375,
      "learning_rate": 6.829242819843343e-06,
      "loss": 0.7305,
      "step": 6540
    },
    {
      "epoch": 0.12333522127720578,
      "grad_norm": 6.875,
      "learning_rate": 6.850130548302872e-06,
      "loss": 0.7247,
      "step": 6560
    },
    {
      "epoch": 0.12371124329329482,
      "grad_norm": 9.1875,
      "learning_rate": 6.871018276762402e-06,
      "loss": 0.7185,
      "step": 6580
    },
    {
      "epoch": 0.12408726530938387,
      "grad_norm": 8.125,
      "learning_rate": 6.8919060052219325e-06,
      "loss": 0.7359,
      "step": 6600
    },
    {
      "epoch": 0.1244632873254729,
      "grad_norm": 11.3125,
      "learning_rate": 6.9127937336814636e-06,
      "loss": 0.7158,
      "step": 6620
    },
    {
      "epoch": 0.12483930934156195,
      "grad_norm": 17.75,
      "learning_rate": 6.933681462140993e-06,
      "loss": 0.7367,
      "step": 6640
    },
    {
      "epoch": 0.12521533135765098,
      "grad_norm": 9.375,
      "learning_rate": 6.954569190600523e-06,
      "loss": 0.718,
      "step": 6660
    },
    {
      "epoch": 0.12559135337374003,
      "grad_norm": 8.375,
      "learning_rate": 6.975456919060053e-06,
      "loss": 0.7174,
      "step": 6680
    },
    {
      "epoch": 0.12596737538982908,
      "grad_norm": 6.90625,
      "learning_rate": 6.996344647519583e-06,
      "loss": 0.7147,
      "step": 6700
    },
    {
      "epoch": 0.12634339740591813,
      "grad_norm": 7.40625,
      "learning_rate": 7.017232375979113e-06,
      "loss": 0.7146,
      "step": 6720
    },
    {
      "epoch": 0.12671941942200715,
      "grad_norm": 9.0,
      "learning_rate": 7.0381201044386425e-06,
      "loss": 0.7205,
      "step": 6740
    },
    {
      "epoch": 0.1270954414380962,
      "grad_norm": 4.9375,
      "learning_rate": 7.059007832898173e-06,
      "loss": 0.715,
      "step": 6760
    },
    {
      "epoch": 0.12747146345418525,
      "grad_norm": 7.375,
      "learning_rate": 7.079895561357703e-06,
      "loss": 0.7167,
      "step": 6780
    },
    {
      "epoch": 0.12784748547027427,
      "grad_norm": 6.3125,
      "learning_rate": 7.100783289817232e-06,
      "loss": 0.7125,
      "step": 6800
    },
    {
      "epoch": 0.12822350748636333,
      "grad_norm": 6.5625,
      "learning_rate": 7.121671018276763e-06,
      "loss": 0.7231,
      "step": 6820
    },
    {
      "epoch": 0.12859952950245238,
      "grad_norm": 7.96875,
      "learning_rate": 7.142558746736294e-06,
      "loss": 0.7154,
      "step": 6840
    },
    {
      "epoch": 0.1289755515185414,
      "grad_norm": 6.46875,
      "learning_rate": 7.163446475195823e-06,
      "loss": 0.7118,
      "step": 6860
    },
    {
      "epoch": 0.12935157353463045,
      "grad_norm": 12.8125,
      "learning_rate": 7.184334203655353e-06,
      "loss": 0.6987,
      "step": 6880
    },
    {
      "epoch": 0.1297275955507195,
      "grad_norm": 6.375,
      "learning_rate": 7.205221932114883e-06,
      "loss": 0.7044,
      "step": 6900
    },
    {
      "epoch": 0.13010361756680855,
      "grad_norm": 6.6875,
      "learning_rate": 7.226109660574413e-06,
      "loss": 0.6975,
      "step": 6920
    },
    {
      "epoch": 0.13047963958289757,
      "grad_norm": 5.875,
      "learning_rate": 7.246997389033943e-06,
      "loss": 0.7044,
      "step": 6940
    },
    {
      "epoch": 0.13085566159898662,
      "grad_norm": 10.8125,
      "learning_rate": 7.2678851174934725e-06,
      "loss": 0.6952,
      "step": 6960
    },
    {
      "epoch": 0.13123168361507567,
      "grad_norm": 5.25,
      "learning_rate": 7.288772845953003e-06,
      "loss": 0.7118,
      "step": 6980
    },
    {
      "epoch": 0.1316077056311647,
      "grad_norm": 5.28125,
      "learning_rate": 7.309660574412534e-06,
      "loss": 0.713,
      "step": 7000
    },
    {
      "epoch": 0.13198372764725375,
      "grad_norm": 8.0625,
      "learning_rate": 7.330548302872063e-06,
      "loss": 0.7068,
      "step": 7020
    },
    {
      "epoch": 0.1323597496633428,
      "grad_norm": 6.21875,
      "learning_rate": 7.3514360313315935e-06,
      "loss": 0.7073,
      "step": 7040
    },
    {
      "epoch": 0.13273577167943182,
      "grad_norm": 9.9375,
      "learning_rate": 7.372323759791124e-06,
      "loss": 0.7097,
      "step": 7060
    },
    {
      "epoch": 0.13311179369552087,
      "grad_norm": 5.09375,
      "learning_rate": 7.393211488250653e-06,
      "loss": 0.6954,
      "step": 7080
    },
    {
      "epoch": 0.13348781571160992,
      "grad_norm": 5.0,
      "learning_rate": 7.414099216710183e-06,
      "loss": 0.7081,
      "step": 7100
    },
    {
      "epoch": 0.13386383772769897,
      "grad_norm": 4.6875,
      "learning_rate": 7.4349869451697136e-06,
      "loss": 0.702,
      "step": 7120
    },
    {
      "epoch": 0.134239859743788,
      "grad_norm": 5.09375,
      "learning_rate": 7.455874673629243e-06,
      "loss": 0.7005,
      "step": 7140
    },
    {
      "epoch": 0.13461588175987704,
      "grad_norm": 7.4375,
      "learning_rate": 7.476762402088773e-06,
      "loss": 0.6976,
      "step": 7160
    },
    {
      "epoch": 0.1349919037759661,
      "grad_norm": 4.46875,
      "learning_rate": 7.497650130548304e-06,
      "loss": 0.6871,
      "step": 7180
    },
    {
      "epoch": 0.13536792579205512,
      "grad_norm": 4.71875,
      "learning_rate": 7.518537859007834e-06,
      "loss": 0.6885,
      "step": 7200
    },
    {
      "epoch": 0.13574394780814417,
      "grad_norm": 4.65625,
      "learning_rate": 7.539425587467364e-06,
      "loss": 0.6964,
      "step": 7220
    },
    {
      "epoch": 0.13611996982423322,
      "grad_norm": 8.6875,
      "learning_rate": 7.560313315926894e-06,
      "loss": 0.7011,
      "step": 7240
    },
    {
      "epoch": 0.13649599184032224,
| "grad_norm": 8.75, | |
| "learning_rate": 7.5812010443864235e-06, | |
| "loss": 0.687, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.1368720138564113, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 7.602088772845954e-06, | |
| "loss": 0.6915, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.13724803587250034, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 7.622976501305483e-06, | |
| "loss": 0.6777, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.1376240578885894, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 7.643864229765013e-06, | |
| "loss": 0.6925, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.1380000799046784, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 7.664751958224544e-06, | |
| "loss": 0.6797, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.13837610192076746, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 7.685639686684074e-06, | |
| "loss": 0.6898, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.13875212393685651, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 7.706527415143604e-06, | |
| "loss": 0.689, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.13912814595294554, | |
| "grad_norm": 4.0, | |
| "learning_rate": 7.727415143603134e-06, | |
| "loss": 0.6946, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.1395041679690346, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 7.748302872062665e-06, | |
| "loss": 0.6817, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.13988018998512364, | |
| "grad_norm": 6.03125, | |
| "learning_rate": 7.769190600522193e-06, | |
| "loss": 0.6864, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.14025621200121266, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 7.790078328981723e-06, | |
| "loss": 0.6869, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.1406322340173017, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 7.810966057441254e-06, | |
| "loss": 0.6908, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.14100825603339076, | |
| "grad_norm": 8.625, | |
| "learning_rate": 7.831853785900784e-06, | |
| "loss": 0.6784, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.14138427804947978, | |
| "grad_norm": 3.25, | |
| "learning_rate": 7.852741514360314e-06, | |
| "loss": 0.6762, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.14176030006556883, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 7.873629242819844e-06, | |
| "loss": 0.6726, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.14213632208165788, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 7.894516971279375e-06, | |
| "loss": 0.6635, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.14251234409774693, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 7.915404699738905e-06, | |
| "loss": 0.6875, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.14288836611383596, | |
| "grad_norm": 4.5625, | |
| "learning_rate": 7.936292428198435e-06, | |
| "loss": 0.6747, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.143264388129925, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 7.957180156657964e-06, | |
| "loss": 0.6749, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.14364041014601406, | |
| "grad_norm": 4.125, | |
| "learning_rate": 7.978067885117494e-06, | |
| "loss": 0.6701, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.14401643216210308, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 7.998955613577024e-06, | |
| "loss": 0.6641, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.14439245417819213, | |
| "grad_norm": 10.75, | |
| "learning_rate": 8.019843342036554e-06, | |
| "loss": 0.661, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.14476847619428118, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 8.040731070496085e-06, | |
| "loss": 0.6687, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.1451444982103702, | |
| "grad_norm": 4.875, | |
| "learning_rate": 8.061618798955613e-06, | |
| "loss": 0.6559, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.14552052022645925, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 8.082506527415143e-06, | |
| "loss": 0.6601, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.1458965422425483, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 8.103394255874675e-06, | |
| "loss": 0.6761, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.14627256425863736, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 8.124281984334205e-06, | |
| "loss": 0.6663, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.14664858627472638, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 8.145169712793734e-06, | |
| "loss": 0.6693, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.14702460829081543, | |
| "grad_norm": 7.25, | |
| "learning_rate": 8.166057441253264e-06, | |
| "loss": 0.6543, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.14740063030690448, | |
| "grad_norm": 4.59375, | |
| "learning_rate": 8.186945169712795e-06, | |
| "loss": 0.6718, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.1477766523229935, | |
| "grad_norm": 4.5, | |
| "learning_rate": 8.207832898172325e-06, | |
| "loss": 0.6534, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.14815267433908255, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 8.228720626631855e-06, | |
| "loss": 0.6576, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.1485286963551716, | |
| "grad_norm": 3.0, | |
| "learning_rate": 8.249608355091384e-06, | |
| "loss": 0.646, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.14890471837126062, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 8.270496083550914e-06, | |
| "loss": 0.6576, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.14928074038734968, | |
| "grad_norm": 4.875, | |
| "learning_rate": 8.291383812010446e-06, | |
| "loss": 0.6728, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.14965676240343873, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 8.312271540469974e-06, | |
| "loss": 0.6676, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.15003278441952778, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 8.333159268929504e-06, | |
| "loss": 0.6537, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.1504088064356168, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 8.354046997389035e-06, | |
| "loss": 0.6551, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.15078482845170585, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 8.374934725848565e-06, | |
| "loss": 0.6531, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 0.1511608504677949, | |
| "grad_norm": 4.0, | |
| "learning_rate": 8.395822454308095e-06, | |
| "loss": 0.654, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 0.15153687248388392, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 8.416710182767624e-06, | |
| "loss": 0.658, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 0.15191289449997297, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 8.437597911227154e-06, | |
| "loss": 0.6516, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 0.15228891651606202, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 8.458485639686684e-06, | |
| "loss": 0.6584, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.15266493853215105, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 8.479373368146214e-06, | |
| "loss": 0.6544, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 0.1530409605482401, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 8.500261096605745e-06, | |
| "loss": 0.6579, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 0.15341698256432915, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 8.521148825065275e-06, | |
| "loss": 0.6413, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 0.1537930045804182, | |
| "grad_norm": 2.734375, | |
| "learning_rate": 8.542036553524805e-06, | |
| "loss": 0.6537, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 0.15416902659650722, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 8.562924281984335e-06, | |
| "loss": 0.6568, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.15454504861259627, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 8.583812010443866e-06, | |
| "loss": 0.636, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 0.15492107062868532, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.604699738903394e-06, | |
| "loss": 0.6439, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 0.15529709264477434, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 8.625587467362924e-06, | |
| "loss": 0.6409, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 0.1556731146608634, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 8.646475195822455e-06, | |
| "loss": 0.6533, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 0.15604913667695244, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 8.667362924281985e-06, | |
| "loss": 0.648, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.15642515869304147, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.688250652741515e-06, | |
| "loss": 0.6388, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.15680118070913052, | |
| "grad_norm": 5.125, | |
| "learning_rate": 8.709138381201045e-06, | |
| "loss": 0.6439, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 0.15717720272521957, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8.730026109660576e-06, | |
| "loss": 0.6397, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 0.15755322474130862, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.750913838120106e-06, | |
| "loss": 0.6342, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 0.15792924675739764, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 8.771801566579634e-06, | |
| "loss": 0.6388, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.1583052687734867, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 8.792689295039165e-06, | |
| "loss": 0.6361, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 0.15868129078957574, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 8.813577023498695e-06, | |
| "loss": 0.6334, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 0.15905731280566476, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 8.834464751958225e-06, | |
| "loss": 0.6556, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 0.1594333348217538, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 8.855352480417755e-06, | |
| "loss": 0.6357, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 0.15980935683784286, | |
| "grad_norm": 3.0, | |
| "learning_rate": 8.876240208877286e-06, | |
| "loss": 0.6383, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.1601853788539319, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8.897127937336816e-06, | |
| "loss": 0.6429, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 0.16056140087002094, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 8.918015665796346e-06, | |
| "loss": 0.6338, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 0.16093742288611, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.938903394255876e-06, | |
| "loss": 0.649, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 0.161313444902199, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 8.959791122715405e-06, | |
| "loss": 0.6272, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 0.16168946691828806, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 8.980678851174935e-06, | |
| "loss": 0.6372, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1620654889343771, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 9.001566579634465e-06, | |
| "loss": 0.6178, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 0.16244151095046616, | |
| "grad_norm": 4.875, | |
| "learning_rate": 9.022454308093996e-06, | |
| "loss": 0.6354, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.16281753296655518, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 9.043342036553526e-06, | |
| "loss": 0.6427, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 0.16319355498264423, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 9.064229765013054e-06, | |
| "loss": 0.6283, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 0.16356957699873328, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 9.085117493472586e-06, | |
| "loss": 0.6364, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1639455990148223, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.106005221932116e-06, | |
| "loss": 0.6289, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 0.16432162103091136, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 9.126892950391647e-06, | |
| "loss": 0.6357, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 0.1646976430470004, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 9.147780678851175e-06, | |
| "loss": 0.6392, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 0.16507366506308943, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 9.168668407310705e-06, | |
| "loss": 0.6211, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 0.16544968707917848, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 9.189556135770236e-06, | |
| "loss": 0.6395, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.16582570909526753, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 9.210443864229766e-06, | |
| "loss": 0.625, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 0.16620173111135658, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.231331592689296e-06, | |
| "loss": 0.6307, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 0.1665777531274456, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.252219321148825e-06, | |
| "loss": 0.6201, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 0.16695377514353466, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 9.273107049608357e-06, | |
| "loss": 0.6211, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 0.1673297971596237, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 9.293994778067887e-06, | |
| "loss": 0.6242, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.16770581917571273, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.314882506527415e-06, | |
| "loss": 0.6216, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 0.16808184119180178, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.335770234986946e-06, | |
| "loss": 0.6271, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 0.16845786320789083, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 9.356657963446476e-06, | |
| "loss": 0.6366, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.16883388522397985, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.377545691906006e-06, | |
| "loss": 0.6155, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 0.1692099072400689, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 9.398433420365536e-06, | |
| "loss": 0.6162, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.16958592925615795, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 9.419321148825065e-06, | |
| "loss": 0.6269, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 0.169961951272247, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 9.440208877284595e-06, | |
| "loss": 0.6306, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 0.17033797328833603, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 9.461096605744125e-06, | |
| "loss": 0.6247, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 0.17071399530442508, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 9.481984334203657e-06, | |
| "loss": 0.613, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 0.17109001732051413, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 9.502872062663186e-06, | |
| "loss": 0.6187, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.17146603933660315, | |
| "grad_norm": 2.875, | |
| "learning_rate": 9.523759791122716e-06, | |
| "loss": 0.6215, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 0.1718420613526922, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 9.544647519582246e-06, | |
| "loss": 0.6234, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 0.17221808336878125, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 9.565535248041777e-06, | |
| "loss": 0.618, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 0.17259410538487027, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 9.586422976501307e-06, | |
| "loss": 0.6134, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 0.17297012740095932, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.607310704960835e-06, | |
| "loss": 0.6127, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.17334614941704837, | |
| "grad_norm": 2.625, | |
| "learning_rate": 9.628198433420366e-06, | |
| "loss": 0.6139, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 0.17372217143313742, | |
| "grad_norm": 2.875, | |
| "learning_rate": 9.649086161879896e-06, | |
| "loss": 0.6121, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 0.17409819344922645, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.669973890339426e-06, | |
| "loss": 0.6126, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 0.1744742154653155, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.690861618798956e-06, | |
| "loss": 0.6162, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 0.17485023748140455, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 9.711749347258487e-06, | |
| "loss": 0.6072, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.17522625949749357, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 9.732637075718017e-06, | |
| "loss": 0.6122, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 0.17560228151358262, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 9.753524804177547e-06, | |
| "loss": 0.6053, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 0.17597830352967167, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.774412532637077e-06, | |
| "loss": 0.6149, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 0.1763543255457607, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.795300261096606e-06, | |
| "loss": 0.6229, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 0.17673034756184974, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 9.816187989556136e-06, | |
| "loss": 0.6134, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.1771063695779388, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 9.837075718015666e-06, | |
| "loss": 0.6155, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 0.17748239159402784, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 9.857963446475197e-06, | |
| "loss": 0.6042, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 0.17785841361011687, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 9.878851174934727e-06, | |
| "loss": 0.6182, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 0.17823443562620592, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.899738903394257e-06, | |
| "loss": 0.6036, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 0.17861045764229497, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 9.920626631853787e-06, | |
| "loss": 0.6027, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.178986479658384, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.941514360313318e-06, | |
| "loss": 0.6089, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 0.17936250167447304, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 9.962402088772846e-06, | |
| "loss": 0.604, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 0.1797385236905621, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.983289817232376e-06, | |
| "loss": 0.6004, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 0.1801145457066511, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 9.999999995880232e-06, | |
| "loss": 0.6019, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 0.18049056772274016, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.999999851688318e-06, | |
| "loss": 0.6145, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.18086658973882921, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 9.999999501507959e-06, | |
| "loss": 0.6105, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 0.18124261175491824, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.999998945339171e-06, | |
| "loss": 0.6139, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 0.1816186337710073, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 9.999998183181976e-06, | |
| "loss": 0.6122, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 0.18199465578709634, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.999997215036408e-06, | |
| "loss": 0.6095, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 0.1823706778031854, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.999996040902503e-06, | |
| "loss": 0.5928, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.1827466998192744, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.999994660780312e-06, | |
| "loss": 0.6034, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 0.18312272183536346, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.99999307466989e-06, | |
| "loss": 0.6012, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 0.1834987438514525, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.999991282571304e-06, | |
| "loss": 0.605, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 0.18387476586754153, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.999989284484629e-06, | |
| "loss": 0.6093, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 0.18425078788363058, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.999987080409942e-06, | |
| "loss": 0.6032, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.18462680989971963, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.99998467034734e-06, | |
| "loss": 0.6019, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 0.18500283191580866, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.99998205429692e-06, | |
| "loss": 0.6006, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 0.1853788539318977, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.999979232258787e-06, | |
| "loss": 0.586, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 0.18575487594798676, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.999976204233062e-06, | |
| "loss": 0.614, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 0.1861308979640758, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.999972970219865e-06, | |
| "loss": 0.6049, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.18650691998016483, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.999969530219333e-06, | |
| "loss": 0.6048, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 0.18688294199625388, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 9.999965884231607e-06, | |
| "loss": 0.608, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 0.18725896401234293, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 9.999962032256836e-06, | |
| "loss": 0.6017, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 0.18763498602843195, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.99995797429518e-06, | |
| "loss": 0.592, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 0.188011008044521, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.999953710346804e-06, | |
| "loss": 0.602, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.18838703006061006, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.999949240411886e-06, | |
| "loss": 0.5894, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 0.18876305207669908, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.99994456449061e-06, | |
| "loss": 0.5908, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 0.18913907409278813, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 9.999939682583166e-06, | |
| "loss": 0.5914, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 0.18951509610887718, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 9.999934594689759e-06, | |
| "loss": 0.5951, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 0.18989111812496623, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 9.999929300810595e-06, | |
| "loss": 0.5925, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.19026714014105525, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.999923800945895e-06, | |
| "loss": 0.5982, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 0.1906431621571443, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.999918095095884e-06, | |
| "loss": 0.6023, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 0.19101918417323335, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.999912183260798e-06, | |
| "loss": 0.5926, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 0.19139520618932238, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.999906065440878e-06, | |
| "loss": 0.5869, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 0.19177122820541143, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.999899741636381e-06, | |
| "loss": 0.5965, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.19214725022150048, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.999893211847563e-06, | |
| "loss": 0.601, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 0.1925232722375895, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 9.999886476074694e-06, | |
| "loss": 0.5916, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 0.19289929425367855, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.999879534318051e-06, | |
| "loss": 0.5947, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 0.1932753162697676, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.999872386577923e-06, | |
| "loss": 0.5979, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 0.19365133828585665, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.9998650328546e-06, | |
| "loss": 0.5927, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.19402736030194567, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 9.99985747314839e-06, | |
| "loss": 0.5999, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 0.19440338231803472, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.999849707459601e-06, | |
| "loss": 0.6072, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 0.19477940433412377, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.999841735788555e-06, | |
| "loss": 0.601, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 0.1951554263502128, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 9.999833558135578e-06, | |
| "loss": 0.5996, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 0.19553144836630185, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.999825174501009e-06, | |
| "loss": 0.5907, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.1959074703823909, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.999816584885192e-06, | |
| "loss": 0.5888, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 0.19628349239847992, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 9.99980778928848e-06, | |
| "loss": 0.5894, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 0.19665951441456897, | |
| "grad_norm": 2.25, | |
| "learning_rate": 9.999798787711239e-06, | |
| "loss": 0.5938, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 0.19703553643065802, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 9.999789580153835e-06, | |
| "loss": 0.5832, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 0.19741155844674707, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.999780166616652e-06, | |
| "loss": 0.579, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.1977875804628361, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.999770547100073e-06, | |
| "loss": 0.596, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 0.19816360247892514, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.9997607216045e-06, | |
| "loss": 0.5934, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 0.1985396244950142, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.999750690130335e-06, | |
| "loss": 0.5884, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 0.19891564651110322, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 9.99974045267799e-06, | |
| "loss": 0.5949, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 0.19929166852719227, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.999730009247888e-06, | |
| "loss": 0.5935, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.19966769054328132, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.999719359840459e-06, | |
| "loss": 0.5904, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 0.20004371255937034, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.99970850445614e-06, | |
| "loss": 0.5811, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 0.2004197345754594, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 9.999697443095383e-06, | |
| "loss": 0.584, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 0.20079575659154844, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.999686175758639e-06, | |
| "loss": 0.586, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 0.20117177860763746, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.999674702446375e-06, | |
| "loss": 0.5924, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.20154780062372651, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 9.999663023159062e-06, | |
| "loss": 0.5876, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 0.20192382263981556, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 9.999651137897182e-06, | |
| "loss": 0.5857, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 0.20229984465590461, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.999639046661226e-06, | |
| "loss": 0.5847, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 0.20267586667199364, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.999626749451688e-06, | |
| "loss": 0.5865, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 0.2030518886880827, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.999614246269076e-06, | |
| "loss": 0.5876, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.20342791070417174, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 9.999601537113908e-06, | |
| "loss": 0.5751, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 0.20380393272026076, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.999588621986707e-06, | |
| "loss": 0.5764, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 0.2041799547363498, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 9.999575500888004e-06, | |
| "loss": 0.5752, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 0.20455597675243886, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.999562173818338e-06, | |
| "loss": 0.5858, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 0.20493199876852788, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.999548640778259e-06, | |
| "loss": 0.5932, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.20530802078461693, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 9.999534901768326e-06, | |
| "loss": 0.5797, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 0.20568404280070599, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.999520956789104e-06, | |
| "loss": 0.5839, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 0.20606006481679504, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 9.999506805841169e-06, | |
| "loss": 0.5883, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 0.20643608683288406, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.999492448925102e-06, | |
| "loss": 0.5793, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 0.2068121088489731, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.999477886041493e-06, | |
| "loss": 0.5795, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.20718813086506216, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.999463117190945e-06, | |
| "loss": 0.5798, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 0.20756415288115118, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.999448142374066e-06, | |
| "loss": 0.5855, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 0.20794017489724023, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 9.999432961591472e-06, | |
| "loss": 0.6086, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 0.20831619691332928, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.999417574843788e-06, | |
| "loss": 0.5777, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 0.2086922189294183, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.99940198213165e-06, | |
| "loss": 0.5858, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.20906824094550736, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.9993861834557e-06, | |
| "loss": 0.5761, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 0.2094442629615964, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.999370178816586e-06, | |
| "loss": 0.5777, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 0.20982028497768546, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.999353968214969e-06, | |
| "loss": 0.5853, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 0.21019630699377448, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.999337551651517e-06, | |
| "loss": 0.5951, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 0.21057232900986353, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 9.999320929126909e-06, | |
| "loss": 0.5874, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.21094835102595258, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 9.999304100641824e-06, | |
| "loss": 0.5924, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 0.2113243730420416, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.99928706619696e-06, | |
| "loss": 0.5927, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 0.21170039505813065, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 9.999269825793018e-06, | |
| "loss": 0.5941, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 0.2120764170742197, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 9.999252379430707e-06, | |
| "loss": 0.5873, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 0.21245243909030873, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 9.999234727110746e-06, | |
| "loss": 0.586, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.21282846110639778, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 9.999216868833864e-06, | |
| "loss": 0.5901, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 0.21320448312248683, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 9.999198804600793e-06, | |
| "loss": 0.5738, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 0.21358050513857588, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.999180534412281e-06, | |
| "loss": 0.5837, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 0.2139565271546649, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 9.999162058269079e-06, | |
| "loss": 0.58, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 0.21433254917075395, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.99914337617195e-06, | |
| "loss": 0.5803, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.214708571186843, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.999124488121658e-06, | |
| "loss": 0.5759, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 0.21508459320293202, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 9.999105394118988e-06, | |
| "loss": 0.5867, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 0.21546061521902107, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.999086094164724e-06, | |
| "loss": 0.5784, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 0.21583663723511012, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 9.99906658825966e-06, | |
| "loss": 0.5796, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 0.21621265925119915, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 9.999046876404602e-06, | |
| "loss": 0.5758, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.2165886812672882, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.999026958600358e-06, | |
| "loss": 0.5852, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 0.21696470328337725, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.999006834847752e-06, | |
| "loss": 0.576, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 0.2173407252994663, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 9.998986505147612e-06, | |
| "loss": 0.5848, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 0.21771674731555532, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.998965969500779e-06, | |
| "loss": 0.5871, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 0.21809276933164437, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.99894522790809e-06, | |
| "loss": 0.5829, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.21846879134773342, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 9.99892428037041e-06, | |
| "loss": 0.5742, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 0.21884481336382244, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 9.998903126888595e-06, | |
| "loss": 0.5841, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 0.2192208353799115, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.998881767463519e-06, | |
| "loss": 0.5819, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 0.21959685739600054, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.998860202096063e-06, | |
| "loss": 0.5805, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 0.21997287941208957, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 9.998838430787112e-06, | |
| "loss": 0.5785, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.22034890142817862, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.998816453537568e-06, | |
| "loss": 0.5804, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 0.22072492344426767, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 9.998794270348331e-06, | |
| "loss": 0.5854, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 0.2211009454603567, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 9.998771881220319e-06, | |
| "loss": 0.5857, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 0.22147696747644574, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 9.99874928615445e-06, | |
| "loss": 0.5855, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 0.2218529894925348, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 9.99872648515166e-06, | |
| "loss": 0.5736, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.22222901150862384, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.998703478212885e-06, | |
| "loss": 0.5792, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 0.22260503352471286, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 9.998680265339076e-06, | |
| "loss": 0.5709, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 0.22298105554080191, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 9.998656846531185e-06, | |
| "loss": 0.5717, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 0.22335707755689096, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.99863322179018e-06, | |
| "loss": 0.5719, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 0.22373309957298, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.99860939111703e-06, | |
| "loss": 0.5874, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.22410912158906904, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 9.998585354512725e-06, | |
| "loss": 0.5723, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 0.2244851436051581, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 9.998561111978246e-06, | |
| "loss": 0.5899, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 0.2248611656212471, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 9.998536663514599e-06, | |
| "loss": 0.5824, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 0.22523718763733616, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 9.998512009122787e-06, | |
| "loss": 0.5668, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 0.2256132096534252, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.998487148803826e-06, | |
| "loss": 0.5701, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.22598923166951426, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.998462082558741e-06, | |
| "loss": 0.576, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 0.22636525368560328, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 9.998436810388566e-06, | |
| "loss": 0.5761, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 0.22674127570169234, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.998411332294341e-06, | |
| "loss": 0.5786, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 0.22711729771778139, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 9.998385648277116e-06, | |
| "loss": 0.5758, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 0.2274933197338704, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 9.998359758337947e-06, | |
| "loss": 0.5769, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.22786934174995946, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 9.998333662477903e-06, | |
| "loss": 0.5666, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 0.2282453637660485, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 9.998307360698059e-06, | |
| "loss": 0.5754, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 0.22862138578213753, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.998280852999496e-06, | |
| "loss": 0.5627, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 0.22899740779822658, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.99825413938331e-06, | |
| "loss": 0.5797, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 0.22937342981431563, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 9.998227219850597e-06, | |
| "loss": 0.5875, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.22974945183040468, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.998200094402471e-06, | |
| "loss": 0.5809, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 0.2301254738464937, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 9.998172763040048e-06, | |
| "loss": 0.5714, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 0.23050149586258276, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.99814522576445e-06, | |
| "loss": 0.5755, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 0.2308775178786718, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 9.998117482576816e-06, | |
| "loss": 0.5764, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 0.23125353989476083, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 9.998089533478287e-06, | |
| "loss": 0.5699, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.23162956191084988, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 9.998061378470016e-06, | |
| "loss": 0.5814, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 0.23200558392693893, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 9.998033017553162e-06, | |
| "loss": 0.5776, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 0.23238160594302795, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 9.99800445072889e-06, | |
| "loss": 0.5776, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 0.232757627959117, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 9.997975677998385e-06, | |
| "loss": 0.574, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 0.23313364997520605, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.997946699362825e-06, | |
| "loss": 0.5668, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.2335096719912951, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 9.997917514823406e-06, | |
| "loss": 0.5711, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 0.23388569400738413, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 9.99788812438133e-06, | |
| "loss": 0.5556, | |
| "step": 12440 | |
| }, | |
| { | |
| "epoch": 0.23426171602347318, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 9.99785852803781e-06, | |
| "loss": 0.5841, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 0.23463773803956223, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 9.997828725794061e-06, | |
| "loss": 0.5763, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 0.23501376005565125, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 9.997798717651316e-06, | |
| "loss": 0.5698, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.2353897820717403, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 9.99776850361081e-06, | |
| "loss": 0.5708, | |
| "step": 12520 | |
| }, | |
| { | |
| "epoch": 0.23576580408782935, | |
| "grad_norm": 0.75, | |
| "learning_rate": 9.997738083673785e-06, | |
| "loss": 0.5727, | |
| "step": 12540 | |
| }, | |
| { | |
| "epoch": 0.23614182610391837, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 9.997707457841496e-06, | |
| "loss": 0.5596, | |
| "step": 12560 | |
| }, | |
| { | |
| "epoch": 0.23651784812000742, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.997676626115205e-06, | |
| "loss": 0.5688, | |
| "step": 12580 | |
| }, | |
| { | |
| "epoch": 0.23689387013609647, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.997645588496181e-06, | |
| "loss": 0.5598, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.23726989215218552, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 9.997614344985705e-06, | |
| "loss": 0.5573, | |
| "step": 12620 | |
| }, | |
| { | |
| "epoch": 0.23764591416827455, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.99758289558506e-06, | |
| "loss": 0.5708, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 0.2380219361843636, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 9.997551240295546e-06, | |
| "loss": 0.5752, | |
| "step": 12660 | |
| }, | |
| { | |
| "epoch": 0.23839795820045265, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 9.997519379118465e-06, | |
| "loss": 0.5741, | |
| "step": 12680 | |
| }, | |
| { | |
| "epoch": 0.23877398021654167, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 9.99748731205513e-06, | |
| "loss": 0.5625, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.23915000223263072, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.997455039106861e-06, | |
| "loss": 0.5751, | |
| "step": 12720 | |
| }, | |
| { | |
| "epoch": 0.23952602424871977, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 9.99742256027499e-06, | |
| "loss": 0.5627, | |
| "step": 12740 | |
| }, | |
| { | |
| "epoch": 0.2399020462648088, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.997389875560853e-06, | |
| "loss": 0.5675, | |
| "step": 12760 | |
| }, | |
| { | |
| "epoch": 0.24027806828089784, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 9.997356984965798e-06, | |
| "loss": 0.5751, | |
| "step": 12780 | |
| }, | |
| { | |
| "epoch": 0.2406540902969869, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 9.997323888491178e-06, | |
| "loss": 0.5762, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.24103011231307592, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 9.997290586138357e-06, | |
| "loss": 0.5744, | |
| "step": 12820 | |
| }, | |
| { | |
| "epoch": 0.24140613432916497, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 9.99725707790871e-06, | |
| "loss": 0.5676, | |
| "step": 12840 | |
| }, | |
| { | |
| "epoch": 0.24178215634525402, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 9.997223363803615e-06, | |
| "loss": 0.5817, | |
| "step": 12860 | |
| }, | |
| { | |
| "epoch": 0.24215817836134307, | |
| "grad_norm": 0.80859375, | |
| "learning_rate": 9.99718944382446e-06, | |
| "loss": 0.5763, | |
| "step": 12880 | |
| }, | |
| { | |
| "epoch": 0.2425342003774321, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 9.997155317972643e-06, | |
| "loss": 0.5745, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.24291022239352114, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 9.99712098624957e-06, | |
| "loss": 0.5663, | |
| "step": 12920 | |
| }, | |
| { | |
| "epoch": 0.2432862444096102, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.997086448656658e-06, | |
| "loss": 0.5695, | |
| "step": 12940 | |
| }, | |
| { | |
| "epoch": 0.24366226642569921, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 9.997051705195326e-06, | |
| "loss": 0.573, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 0.24403828844178826, | |
| "grad_norm": 0.88671875, | |
| "learning_rate": 9.997016755867008e-06, | |
| "loss": 0.5698, | |
| "step": 12980 | |
| }, | |
| { | |
| "epoch": 0.24441431045787732, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.996981600673144e-06, | |
| "loss": 0.5666, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.24479033247396634, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 9.99694623961518e-06, | |
| "loss": 0.5694, | |
| "step": 13020 | |
| }, | |
| { | |
| "epoch": 0.2451663544900554, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 9.996910672694573e-06, | |
| "loss": 0.5574, | |
| "step": 13040 | |
| }, | |
| { | |
| "epoch": 0.24554237650614444, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.99687489991279e-06, | |
| "loss": 0.5564, | |
| "step": 13060 | |
| }, | |
| { | |
| "epoch": 0.2459183985222335, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 9.996838921271304e-06, | |
| "loss": 0.5666, | |
| "step": 13080 | |
| }, | |
| { | |
| "epoch": 0.2462944205383225, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 9.996802736771597e-06, | |
| "loss": 0.5758, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.24667044255441156, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 9.99676634641516e-06, | |
| "loss": 0.5619, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 0.2470464645705006, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.996729750203493e-06, | |
| "loss": 0.5817, | |
| "step": 13140 | |
| }, | |
| { | |
| "epoch": 0.24742248658658964, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 9.996692948138102e-06, | |
| "loss": 0.5705, | |
| "step": 13160 | |
| }, | |
| { | |
| "epoch": 0.24779850860267869, | |
| "grad_norm": 0.8203125, | |
| "learning_rate": 9.996655940220504e-06, | |
| "loss": 0.5713, | |
| "step": 13180 | |
| }, | |
| { | |
| "epoch": 0.24817453061876774, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 9.996618726452223e-06, | |
| "loss": 0.5715, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.24855055263485676, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 9.996581306834793e-06, | |
| "loss": 0.5622, | |
| "step": 13220 | |
| }, | |
| { | |
| "epoch": 0.2489265746509458, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 9.996543681369756e-06, | |
| "loss": 0.5636, | |
| "step": 13240 | |
| }, | |
| { | |
| "epoch": 0.24930259666703486, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.996505850058663e-06, | |
| "loss": 0.5753, | |
| "step": 13260 | |
| }, | |
| { | |
| "epoch": 0.2496786186831239, | |
| "grad_norm": 0.796875, | |
| "learning_rate": 9.996467812903067e-06, | |
| "loss": 0.5774, | |
| "step": 13280 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 319134, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 13297, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.9972288387703433e+20, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
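
The record above is the `log_history` of a Hugging Face Trainer checkpoint state: one entry per logging event (every 20 steps, per `logging_steps`), each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. A minimal sketch of how one might load and summarize it follows; the filename `trainer_state.json` is an assumption (it is the name the Trainer normally gives this file inside a checkpoint directory), and the plotting step is optional.

```python
import json

# Load the trainer state shown above. The filename is an assumption:
# Hugging Face Trainer checkpoints normally save it as trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# One dict per logging event; keys match the entries above.
history = state["log_history"]

steps = [h["step"] for h in history]
losses = [h["loss"] for h in history]
lrs = [h["learning_rate"] for h in history]
grad_norms = [h["grad_norm"] for h in history]

print(f"logged events:      {len(history)}")
print(f"first / last step:  {steps[0]} / {steps[-1]}")
print(f"loss:               {losses[0]:.4f} -> {losses[-1]:.4f}")
print(f"peak learning rate: {max(lrs):.3e}")
print(f"max grad norm:      {max(grad_norms):.2f}")

# Optional: plot loss and learning rate on twin axes (requires matplotlib).
try:
    import matplotlib.pyplot as plt

    fig, ax1 = plt.subplots()
    ax1.plot(steps, losses, label="loss")
    ax1.set_xlabel("step")
    ax1.set_ylabel("loss")
    ax2 = ax1.twinx()
    ax2.plot(steps, lrs, color="tab:orange", label="learning rate")
    ax2.set_ylabel("learning rate")
    fig.tight_layout()
    plt.savefig("loss_curve.png")
except ImportError:
    pass  # plotting is optional; the summary above still prints
```

Run against this state, the summary makes the schedule visible at a glance: the learning rate warms up linearly to its ~1e-5 peak near step 9580 and then begins a slow decay, while the training loss falls from roughly 0.73 to 0.58 over steps 6400–13280.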