{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 313,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032,
      "grad_norm": 1.2938580513000488,
      "learning_rate": 0.0,
      "loss": 2.3978,
      "step": 1
    },
    {
      "epoch": 0.0064,
      "grad_norm": 1.3802108764648438,
      "learning_rate": 1.5625e-06,
      "loss": 2.1638,
      "step": 2
    },
    {
      "epoch": 0.0096,
      "grad_norm": 1.401850700378418,
      "learning_rate": 3.125e-06,
      "loss": 2.0659,
      "step": 3
    },
    {
      "epoch": 0.0128,
      "grad_norm": NaN,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 2.466,
      "step": 4
    },
    {
      "epoch": 0.016,
      "grad_norm": 1.5032564401626587,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 2.088,
      "step": 5
    },
    {
      "epoch": 0.0192,
      "grad_norm": 3.115506172180176,
      "learning_rate": 6.25e-06,
      "loss": 2.0598,
      "step": 6
    },
    {
      "epoch": 0.0224,
      "grad_norm": 1.3768525123596191,
      "learning_rate": 7.8125e-06,
      "loss": 2.2994,
      "step": 7
    },
    {
      "epoch": 0.0256,
      "grad_norm": 1.4846614599227905,
      "learning_rate": 9.375000000000001e-06,
      "loss": 2.0359,
      "step": 8
    },
    {
      "epoch": 0.0288,
      "grad_norm": 4.242325782775879,
      "learning_rate": 1.09375e-05,
      "loss": 1.9999,
      "step": 9
    },
    {
      "epoch": 0.032,
      "grad_norm": 2.884045362472534,
      "learning_rate": 1.25e-05,
      "loss": 1.6543,
      "step": 10
    },
    {
      "epoch": 0.0352,
      "grad_norm": 1.1325325965881348,
      "learning_rate": 1.4062500000000001e-05,
      "loss": 2.5745,
      "step": 11
    },
    {
      "epoch": 0.0384,
      "grad_norm": 1.1092923879623413,
      "learning_rate": 1.5625e-05,
      "loss": 2.3241,
      "step": 12
    },
    {
      "epoch": 0.0416,
      "grad_norm": 1.071656346321106,
      "learning_rate": 1.71875e-05,
      "loss": 2.3016,
      "step": 13
    },
    {
      "epoch": 0.0448,
      "grad_norm": 1.2169474363327026,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 1.9022,
      "step": 14
    },
    {
      "epoch": 0.048,
      "grad_norm": 1.1128144264221191,
      "learning_rate": 2.0312500000000002e-05,
      "loss": 2.0983,
      "step": 15
    },
    {
      "epoch": 0.0512,
      "grad_norm": 1.367555022239685,
      "learning_rate": 2.1875e-05,
      "loss": 2.3887,
      "step": 16
    },
    {
      "epoch": 0.0544,
      "grad_norm": 1.3732932806015015,
      "learning_rate": 2.34375e-05,
      "loss": 2.4338,
      "step": 17
    },
    {
      "epoch": 0.0576,
      "grad_norm": 1.0335572957992554,
      "learning_rate": 2.5e-05,
      "loss": 2.1441,
      "step": 18
    },
    {
      "epoch": 0.0608,
      "grad_norm": 1.057332158088684,
      "learning_rate": 2.6562500000000002e-05,
      "loss": 2.4553,
      "step": 19
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.9568037986755371,
      "learning_rate": 2.8125000000000003e-05,
      "loss": 2.1104,
      "step": 20
    },
    {
      "epoch": 0.0672,
      "grad_norm": 0.9306254982948303,
      "learning_rate": 2.96875e-05,
      "loss": 1.936,
      "step": 21
    },
    {
      "epoch": 0.0704,
      "grad_norm": 0.9852834343910217,
      "learning_rate": 3.125e-05,
      "loss": 2.3996,
      "step": 22
    },
    {
      "epoch": 0.0736,
      "grad_norm": 1.0083969831466675,
      "learning_rate": 3.2812500000000005e-05,
      "loss": 2.3265,
      "step": 23
    },
    {
      "epoch": 0.0768,
      "grad_norm": 2.5204765796661377,
      "learning_rate": 3.4375e-05,
      "loss": 2.3637,
      "step": 24
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.87235027551651,
      "learning_rate": 3.59375e-05,
      "loss": 2.049,
      "step": 25
    },
    {
      "epoch": 0.0832,
      "grad_norm": 1.0738499164581299,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 2.2154,
      "step": 26
    },
    {
      "epoch": 0.0864,
      "grad_norm": 1.1177905797958374,
      "learning_rate": 3.90625e-05,
      "loss": 2.2278,
      "step": 27
    },
    {
      "epoch": 0.0896,
      "grad_norm": 0.9642391800880432,
      "learning_rate": 4.0625000000000005e-05,
      "loss": 2.1245,
      "step": 28
    },
    {
      "epoch": 0.0928,
      "grad_norm": 0.9148507118225098,
      "learning_rate": 4.21875e-05,
      "loss": 2.3712,
      "step": 29
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.0361849069595337,
      "learning_rate": 4.375e-05,
      "loss": 2.1405,
      "step": 30
    },
    {
      "epoch": 0.0992,
      "grad_norm": 7.4032697677612305,
      "learning_rate": 4.5312500000000004e-05,
      "loss": 1.8717,
      "step": 31
    },
    {
      "epoch": 0.1024,
      "grad_norm": 1.011413812637329,
      "learning_rate": 4.6875e-05,
      "loss": 2.0154,
      "step": 32
    },
    {
      "epoch": 0.1056,
      "grad_norm": 1.0620793104171753,
      "learning_rate": 4.8437500000000005e-05,
      "loss": 2.2802,
      "step": 33
    },
    {
      "epoch": 0.1088,
      "grad_norm": 1.0524107217788696,
      "learning_rate": 5e-05,
      "loss": 2.1007,
      "step": 34
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.9986498355865479,
      "learning_rate": 4.999843759868819e-05,
      "loss": 2.0176,
      "step": 35
    },
    {
      "epoch": 0.1152,
      "grad_norm": 1.047672986984253,
      "learning_rate": 4.9993750590040575e-05,
      "loss": 2.3632,
      "step": 36
    },
    {
      "epoch": 0.1184,
      "grad_norm": 0.9666229486465454,
      "learning_rate": 4.998593955989626e-05,
      "loss": 2.0895,
      "step": 37
    },
    {
      "epoch": 0.1216,
      "grad_norm": 1.1995681524276733,
      "learning_rate": 4.9975005484572305e-05,
      "loss": 1.9112,
      "step": 38
    },
    {
      "epoch": 0.1248,
      "grad_norm": 3.1392664909362793,
      "learning_rate": 4.996094973074183e-05,
      "loss": 2.0765,
      "step": 39
    },
    {
      "epoch": 0.128,
      "grad_norm": 1.1716688871383667,
      "learning_rate": 4.994377405526308e-05,
      "loss": 1.7757,
      "step": 40
    },
    {
      "epoch": 0.1312,
      "grad_norm": 0.9636698365211487,
      "learning_rate": 4.992348060495989e-05,
      "loss": 2.1896,
      "step": 41
    },
    {
      "epoch": 0.1344,
      "grad_norm": 0.9387962222099304,
      "learning_rate": 4.990007191635334e-05,
      "loss": 2.1875,
      "step": 42
    },
    {
      "epoch": 0.1376,
      "grad_norm": 0.918551504611969,
      "learning_rate": 4.987355091534468e-05,
      "loss": 2.1049,
      "step": 43
    },
    {
      "epoch": 0.1408,
      "grad_norm": 0.9996175169944763,
      "learning_rate": 4.9843920916849645e-05,
      "loss": 2.4059,
      "step": 44
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.9736499190330505,
      "learning_rate": 4.981118562438414e-05,
      "loss": 1.999,
      "step": 45
    },
    {
      "epoch": 0.1472,
      "grad_norm": 1.036577582359314,
      "learning_rate": 4.9775349129601243e-05,
      "loss": 2.0207,
      "step": 46
    },
    {
      "epoch": 0.1504,
      "grad_norm": 0.9663489460945129,
      "learning_rate": 4.973641591177991e-05,
      "loss": 1.8309,
      "step": 47
    },
    {
      "epoch": 0.1536,
      "grad_norm": 0.9678478837013245,
      "learning_rate": 4.969439083726496e-05,
      "loss": 1.8601,
      "step": 48
    },
    {
      "epoch": 0.1568,
      "grad_norm": 1.1323456764221191,
      "learning_rate": 4.964927915885893e-05,
      "loss": 1.9505,
      "step": 49
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8682395219802856,
      "learning_rate": 4.960108651516545e-05,
      "loss": 1.7868,
      "step": 50
    },
    {
      "epoch": 0.1632,
      "grad_norm": 0.9288162589073181,
      "learning_rate": 4.954981892988451e-05,
      "loss": 2.1843,
      "step": 51
    },
    {
      "epoch": 0.1664,
      "grad_norm": 1.0266203880310059,
      "learning_rate": 4.949548281105951e-05,
      "loss": 2.3133,
      "step": 52
    },
    {
      "epoch": 0.1696,
      "grad_norm": 0.9135573506355286,
      "learning_rate": 4.943808495027631e-05,
      "loss": 2.1877,
      "step": 53
    },
    {
      "epoch": 0.1728,
      "grad_norm": 0.942903995513916,
      "learning_rate": 4.937763252181434e-05,
      "loss": 1.788,
      "step": 54
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.918860137462616,
      "learning_rate": 4.93141330817499e-05,
      "loss": 2.2997,
      "step": 55
    },
    {
      "epoch": 0.1792,
      "grad_norm": 1.0075480937957764,
      "learning_rate": 4.924759456701167e-05,
      "loss": 2.2172,
      "step": 56
    },
    {
      "epoch": 0.1824,
      "grad_norm": 0.8399456143379211,
      "learning_rate": 4.917802529438864e-05,
      "loss": 1.6652,
      "step": 57
    },
    {
      "epoch": 0.1856,
      "grad_norm": 0.8925999402999878,
      "learning_rate": 4.910543395949067e-05,
      "loss": 1.8684,
      "step": 58
    },
    {
      "epoch": 0.1888,
      "grad_norm": 0.8900786638259888,
      "learning_rate": 4.9029829635661475e-05,
      "loss": 2.1491,
      "step": 59
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.9098831415176392,
      "learning_rate": 4.895122177284465e-05,
      "loss": 1.9692,
      "step": 60
    },
    {
      "epoch": 0.1952,
      "grad_norm": 0.9300256371498108,
      "learning_rate": 4.8869620196402436e-05,
      "loss": 2.1417,
      "step": 61
    },
    {
      "epoch": 0.1984,
      "grad_norm": 0.8800130486488342,
      "learning_rate": 4.878503510588765e-05,
      "loss": 2.1419,
      "step": 62
    },
    {
      "epoch": 0.2016,
      "grad_norm": 0.869717538356781,
      "learning_rate": 4.8697477073768766e-05,
      "loss": 2.0039,
      "step": 63
    },
    {
      "epoch": 0.2048,
      "grad_norm": 1.0832667350769043,
      "learning_rate": 4.8606957044108556e-05,
      "loss": 1.9359,
      "step": 64
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.8442544341087341,
      "learning_rate": 4.851348633119606e-05,
      "loss": 1.771,
      "step": 65
    },
    {
      "epoch": 0.2112,
      "grad_norm": 0.8307965397834778,
      "learning_rate": 4.8417076618132426e-05,
      "loss": 1.7329,
      "step": 66
    },
    {
      "epoch": 0.2144,
      "grad_norm": 0.8500248789787292,
      "learning_rate": 4.8317739955370636e-05,
      "loss": 1.927,
      "step": 67
    },
    {
      "epoch": 0.2176,
      "grad_norm": 0.8779441714286804,
      "learning_rate": 4.821548875920927e-05,
      "loss": 1.8811,
      "step": 68
    },
    {
      "epoch": 0.2208,
      "grad_norm": 0.9491339325904846,
      "learning_rate": 4.811033581024056e-05,
      "loss": 2.0995,
      "step": 69
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.9209153056144714,
      "learning_rate": 4.800229425175294e-05,
      "loss": 2.0308,
      "step": 70
    },
    {
      "epoch": 0.2272,
      "grad_norm": 0.9706084132194519,
      "learning_rate": 4.7891377588088223e-05,
      "loss": 2.1603,
      "step": 71
    },
    {
      "epoch": 0.2304,
      "grad_norm": 0.8401190638542175,
      "learning_rate": 4.777759968295369e-05,
      "loss": 1.8921,
      "step": 72
    },
    {
      "epoch": 0.2336,
      "grad_norm": 1.7388620376586914,
      "learning_rate": 4.766097475768919e-05,
      "loss": 1.9469,
      "step": 73
    },
    {
      "epoch": 0.2368,
      "grad_norm": 0.8813150525093079,
      "learning_rate": 4.754151738948962e-05,
      "loss": 1.8235,
      "step": 74
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8916892409324646,
      "learning_rate": 4.741924250958289e-05,
      "loss": 1.9513,
      "step": 75
    },
    {
      "epoch": 0.2432,
      "grad_norm": 0.8717350959777832,
      "learning_rate": 4.729416540136361e-05,
      "loss": 1.9231,
      "step": 76
    },
    {
      "epoch": 0.2464,
      "grad_norm": 0.9412696957588196,
      "learning_rate": 4.7166301698482815e-05,
      "loss": 2.099,
      "step": 77
    },
    {
      "epoch": 0.2496,
      "grad_norm": 0.9612340331077576,
      "learning_rate": 4.703566738289389e-05,
      "loss": 2.059,
      "step": 78
    },
    {
      "epoch": 0.2528,
      "grad_norm": 1.0139132738113403,
      "learning_rate": 4.69022787828549e-05,
      "loss": 1.6535,
      "step": 79
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.9509197473526001,
      "learning_rate": 4.676615257088776e-05,
      "loss": 2.1214,
      "step": 80
    },
    {
      "epoch": 0.2592,
      "grad_norm": 0.8892962336540222,
      "learning_rate": 4.662730576169423e-05,
      "loss": 2.1229,
      "step": 81
    },
    {
      "epoch": 0.2624,
      "grad_norm": 0.8373662829399109,
      "learning_rate": 4.6485755710029256e-05,
      "loss": 1.8558,
      "step": 82
    },
    {
      "epoch": 0.2656,
      "grad_norm": 0.8412055373191833,
      "learning_rate": 4.6341520108531746e-05,
      "loss": 1.7079,
      "step": 83
    },
    {
      "epoch": 0.2688,
      "grad_norm": 1.3762493133544922,
      "learning_rate": 4.619461698551315e-05,
      "loss": 1.8366,
      "step": 84
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.9610726237297058,
      "learning_rate": 4.604506470270403e-05,
      "loss": 2.1713,
      "step": 85
    },
    {
      "epoch": 0.2752,
      "grad_norm": 0.846570611000061,
      "learning_rate": 4.589288195295901e-05,
      "loss": 1.8648,
      "step": 86
    },
    {
      "epoch": 0.2784,
      "grad_norm": 0.9135278463363647,
      "learning_rate": 4.573808775792033e-05,
      "loss": 2.182,
      "step": 87
    },
    {
      "epoch": 0.2816,
      "grad_norm": 0.8865692615509033,
      "learning_rate": 4.5580701465640254e-05,
      "loss": 1.9423,
      "step": 88
    },
    {
      "epoch": 0.2848,
      "grad_norm": 0.8762995004653931,
      "learning_rate": 4.5420742748162734e-05,
      "loss": 1.9934,
      "step": 89
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.9263061881065369,
      "learning_rate": 4.525823159906459e-05,
      "loss": 2.2362,
      "step": 90
    },
    {
      "epoch": 0.2912,
      "grad_norm": 0.8523189425468445,
      "learning_rate": 4.509318833095642e-05,
      "loss": 2.1758,
      "step": 91
    },
    {
      "epoch": 0.2944,
      "grad_norm": 0.9841741919517517,
      "learning_rate": 4.492563357294369e-05,
      "loss": 2.0549,
      "step": 92
    },
    {
      "epoch": 0.2976,
      "grad_norm": 0.9290546178817749,
      "learning_rate": 4.475558826804833e-05,
      "loss": 1.8732,
      "step": 93
    },
    {
      "epoch": 0.3008,
      "grad_norm": 0.8647733926773071,
      "learning_rate": 4.458307367059092e-05,
      "loss": 2.0057,
      "step": 94
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.9161149859428406,
      "learning_rate": 4.440811134353412e-05,
      "loss": 1.7013,
      "step": 95
    },
    {
      "epoch": 0.3072,
      "grad_norm": 0.8401817083358765,
      "learning_rate": 4.42307231557875e-05,
      "loss": 2.0538,
      "step": 96
    },
    {
      "epoch": 0.3104,
      "grad_norm": 1.011531114578247,
      "learning_rate": 4.4050931279474015e-05,
      "loss": 1.998,
      "step": 97
    },
    {
      "epoch": 0.3136,
      "grad_norm": 0.8913917541503906,
      "learning_rate": 4.386875818715874e-05,
      "loss": 1.9491,
      "step": 98
    },
    {
      "epoch": 0.3168,
      "grad_norm": 0.9900378584861755,
      "learning_rate": 4.368422664903997e-05,
      "loss": 1.8276,
      "step": 99
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.8170567750930786,
      "learning_rate": 4.349735973010305e-05,
      "loss": 1.8383,
      "step": 100
    },
    {
      "epoch": 0.3232,
      "grad_norm": 0.8767079710960388,
      "learning_rate": 4.330818078723755e-05,
      "loss": 1.8861,
      "step": 101
    },
    {
      "epoch": 0.3264,
      "grad_norm": 0.9571582078933716,
      "learning_rate": 4.311671346631774e-05,
      "loss": 1.8612,
      "step": 102
    },
    {
      "epoch": 0.3296,
      "grad_norm": 0.9190142154693604,
      "learning_rate": 4.292298169924709e-05,
      "loss": 2.1069,
      "step": 103
    },
    {
      "epoch": 0.3328,
      "grad_norm": 0.9096384644508362,
      "learning_rate": 4.272700970096696e-05,
      "loss": 2.0704,
      "step": 104
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.929742693901062,
      "learning_rate": 4.252882196642992e-05,
      "loss": 1.8269,
      "step": 105
    },
    {
      "epoch": 0.3392,
      "grad_norm": 0.8830273747444153,
      "learning_rate": 4.23284432675381e-05,
      "loss": 1.8447,
      "step": 106
    },
    {
      "epoch": 0.3424,
      "grad_norm": 0.866507887840271,
      "learning_rate": 4.212589865004684e-05,
      "loss": 2.2203,
      "step": 107
    },
    {
      "epoch": 0.3456,
      "grad_norm": 0.860091507434845,
      "learning_rate": 4.192121343043424e-05,
      "loss": 2.0754,
      "step": 108
    },
    {
      "epoch": 0.3488,
      "grad_norm": 0.9852222204208374,
      "learning_rate": 4.1714413192736754e-05,
      "loss": 1.7543,
      "step": 109
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.8804312944412231,
      "learning_rate": 4.150552378535137e-05,
      "loss": 2.0036,
      "step": 110
    },
    {
      "epoch": 0.3552,
      "grad_norm": 0.7997055649757385,
      "learning_rate": 4.1294571317804854e-05,
      "loss": 1.6802,
      "step": 111
    },
    {
      "epoch": 0.3584,
      "grad_norm": 0.9458661079406738,
      "learning_rate": 4.108158215749014e-05,
      "loss": 2.1944,
      "step": 112
    },
    {
      "epoch": 0.3616,
      "grad_norm": 0.9290521144866943,
      "learning_rate": 4.0866582926370725e-05,
      "loss": 1.9413,
      "step": 113
    },
    {
      "epoch": 0.3648,
      "grad_norm": 0.9188751578330994,
      "learning_rate": 4.064960049765304e-05,
      "loss": 2.1782,
      "step": 114
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.9197388887405396,
      "learning_rate": 4.043066199242762e-05,
      "loss": 2.301,
      "step": 115
    },
    {
      "epoch": 0.3712,
      "grad_norm": 0.8998375535011292,
      "learning_rate": 4.020979477627907e-05,
      "loss": 1.8035,
      "step": 116
    },
    {
      "epoch": 0.3744,
      "grad_norm": 0.8178901672363281,
      "learning_rate": 3.998702645586565e-05,
      "loss": 1.8764,
      "step": 117
    },
    {
      "epoch": 0.3776,
      "grad_norm": 0.9451196789741516,
      "learning_rate": 3.976238487546864e-05,
      "loss": 1.8488,
      "step": 118
    },
    {
      "epoch": 0.3808,
      "grad_norm": 0.8145877718925476,
      "learning_rate": 3.953589811351204e-05,
      "loss": 1.8894,
      "step": 119
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.8082287311553955,
      "learning_rate": 3.930759447905298e-05,
      "loss": 1.7236,
      "step": 120
    },
    {
      "epoch": 0.3872,
      "grad_norm": 1.0476171970367432,
      "learning_rate": 3.907750250824327e-05,
      "loss": 2.1334,
      "step": 121
    },
    {
      "epoch": 0.3904,
      "grad_norm": 0.9171006679534912,
      "learning_rate": 3.884565096076269e-05,
      "loss": 1.7087,
      "step": 122
    },
    {
      "epoch": 0.3936,
      "grad_norm": 0.9148180484771729,
      "learning_rate": 3.861206881622419e-05,
      "loss": 2.1576,
      "step": 123
    },
    {
      "epoch": 0.3968,
      "grad_norm": 0.8305053114891052,
      "learning_rate": 3.837678527055168e-05,
      "loss": 1.5619,
      "step": 124
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8816688060760498,
      "learning_rate": 3.813982973233083e-05,
      "loss": 1.7811,
      "step": 125
    },
    {
      "epoch": 0.4032,
      "grad_norm": 0.833396315574646,
      "learning_rate": 3.7901231819133105e-05,
      "loss": 1.9271,
      "step": 126
    },
    {
      "epoch": 0.4064,
      "grad_norm": 0.9301218390464783,
      "learning_rate": 3.766102135381393e-05,
      "loss": 2.1721,
      "step": 127
    },
    {
      "epoch": 0.4096,
      "grad_norm": 0.8964502215385437,
      "learning_rate": 3.741922836078499e-05,
      "loss": 1.7697,
      "step": 128
    },
    {
      "epoch": 0.4128,
      "grad_norm": 0.919535219669342,
      "learning_rate": 3.717588306226143e-05,
      "loss": 1.9075,
      "step": 129
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.9279409646987915,
      "learning_rate": 3.693101587448436e-05,
      "loss": 1.8996,
      "step": 130
    },
    {
      "epoch": 0.4192,
      "grad_norm": 0.8635279536247253,
      "learning_rate": 3.6684657403919005e-05,
      "loss": 1.8244,
      "step": 131
    },
    {
      "epoch": 0.4224,
      "grad_norm": 0.913209855556488,
      "learning_rate": 3.6436838443429175e-05,
      "loss": 2.0287,
      "step": 132
    },
    {
      "epoch": 0.4256,
      "grad_norm": 0.8230111002922058,
      "learning_rate": 3.618758996842839e-05,
      "loss": 1.8636,
      "step": 133
    },
    {
      "epoch": 0.4288,
      "grad_norm": 0.8902285099029541,
      "learning_rate": 3.5936943133008183e-05,
      "loss": 2.3189,
      "step": 134
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.8231277465820312,
      "learning_rate": 3.568492926604412e-05,
      "loss": 1.7345,
      "step": 135
    },
    {
      "epoch": 0.4352,
      "grad_norm": 0.82820063829422,
      "learning_rate": 3.5431579867279905e-05,
      "loss": 1.8564,
      "step": 136
    },
    {
      "epoch": 0.4384,
      "grad_norm": 0.8306258320808411,
      "learning_rate": 3.517692660339018e-05,
      "loss": 1.8671,
      "step": 137
    },
    {
      "epoch": 0.4416,
      "grad_norm": 0.8603615760803223,
      "learning_rate": 3.492100130402242e-05,
      "loss": 1.737,
      "step": 138
    },
    {
      "epoch": 0.4448,
      "grad_norm": 0.8179602026939392,
      "learning_rate": 3.4663835957818515e-05,
      "loss": 1.8263,
      "step": 139
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.8565822839736938,
      "learning_rate": 3.440546270841639e-05,
      "loss": 1.8216,
      "step": 140
    },
    {
      "epoch": 0.4512,
      "grad_norm": 0.9373151659965515,
      "learning_rate": 3.414591385043237e-05,
      "loss": 2.236,
      "step": 141
    },
    {
      "epoch": 0.4544,
      "grad_norm": 0.8852721452713013,
      "learning_rate": 3.3885221825424537e-05,
      "loss": 1.7058,
      "step": 142
    },
    {
      "epoch": 0.4576,
      "grad_norm": 0.8958799839019775,
      "learning_rate": 3.362341921783784e-05,
      "loss": 2.0029,
      "step": 143
    },
    {
      "epoch": 0.4608,
      "grad_norm": 0.9146959185600281,
      "learning_rate": 3.336053875093128e-05,
      "loss": 1.9745,
      "step": 144
    },
    {
      "epoch": 0.464,
      "grad_norm": 0.8560377359390259,
      "learning_rate": 3.309661328268776e-05,
      "loss": 1.939,
      "step": 145
    },
    {
      "epoch": 0.4672,
      "grad_norm": 0.9292604327201843,
      "learning_rate": 3.283167580170712e-05,
      "loss": 2.1731,
      "step": 146
    },
    {
      "epoch": 0.4704,
      "grad_norm": 0.8372029066085815,
      "learning_rate": 3.256575942308278e-05,
      "loss": 1.7536,
      "step": 147
    },
    {
      "epoch": 0.4736,
      "grad_norm": 1.0057106018066406,
      "learning_rate": 3.229889738426264e-05,
      "loss": 2.0545,
      "step": 148
    },
    {
      "epoch": 0.4768,
      "grad_norm": 0.905636191368103,
      "learning_rate": 3.203112304089466e-05,
      "loss": 2.0688,
      "step": 149
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9915895462036133,
      "learning_rate": 3.176246986265767e-05,
      "loss": 1.7727,
      "step": 150
    },
    {
      "epoch": 0.4832,
      "grad_norm": 0.8286227583885193,
      "learning_rate": 3.149297142907792e-05,
      "loss": 1.9014,
      "step": 151
    },
    {
      "epoch": 0.4864,
      "grad_norm": 0.8335422873497009,
      "learning_rate": 3.122266142533191e-05,
      "loss": 1.8484,
      "step": 152
    },
    {
      "epoch": 0.4896,
      "grad_norm": 0.8963138461112976,
      "learning_rate": 3.095157363803598e-05,
      "loss": 2.042,
      "step": 153
    },
    {
      "epoch": 0.4928,
      "grad_norm": 0.9721676707267761,
      "learning_rate": 3.06797419510233e-05,
      "loss": 1.7216,
      "step": 154
    },
    {
      "epoch": 0.496,
      "grad_norm": 0.927716076374054,
      "learning_rate": 3.0407200341108617e-05,
      "loss": 1.948,
      "step": 155
    },
    {
      "epoch": 0.4992,
      "grad_norm": 0.830467164516449,
      "learning_rate": 3.013398287384144e-05,
      "loss": 1.8576,
      "step": 156
    },
    {
      "epoch": 0.5024,
      "grad_norm": 0.9040769338607788,
      "learning_rate": 2.986012369924811e-05,
      "loss": 1.8908,
      "step": 157
    },
    {
      "epoch": 0.5056,
      "grad_norm": 0.8086222410202026,
      "learning_rate": 2.9585657047563315e-05,
      "loss": 1.868,
      "step": 158
    },
    {
      "epoch": 0.5088,
      "grad_norm": 0.8116671442985535,
      "learning_rate": 2.931061722495159e-05,
      "loss": 1.7886,
      "step": 159
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.9567749500274658,
      "learning_rate": 2.9035038609219306e-05,
      "loss": 1.853,
      "step": 160
    },
    {
      "epoch": 0.5152,
      "grad_norm": 0.9087838530540466,
      "learning_rate": 2.875895564551772e-05,
      "loss": 1.9346,
      "step": 161
    },
    {
      "epoch": 0.5184,
      "grad_norm": 1.079892873764038,
      "learning_rate": 2.8482402842037614e-05,
      "loss": 1.5922,
      "step": 162
    },
    {
      "epoch": 0.5216,
      "grad_norm": 0.8078391551971436,
      "learning_rate": 2.8205414765696003e-05,
      "loss": 1.6757,
      "step": 163
    },
    {
      "epoch": 0.5248,
      "grad_norm": 0.9625094532966614,
      "learning_rate": 2.792802603781562e-05,
      "loss": 2.0973,
      "step": 164
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.8933418393135071,
      "learning_rate": 2.7650271329797427e-05,
      "loss": 1.9035,
      "step": 165
    },
    {
      "epoch": 0.5312,
      "grad_norm": 0.972737193107605,
      "learning_rate": 2.737218535878705e-05,
      "loss": 1.694,
      "step": 166
    },
    {
      "epoch": 0.5344,
      "grad_norm": 0.8630124926567078,
      "learning_rate": 2.7093802883335357e-05,
      "loss": 1.9106,
      "step": 167
    },
    {
      "epoch": 0.5376,
      "grad_norm": 1.0283786058425903,
      "learning_rate": 2.6815158699053932e-05,
      "loss": 2.0379,
      "step": 168
    },
    {
      "epoch": 0.5408,
      "grad_norm": 0.8262351751327515,
      "learning_rate": 2.6536287634265918e-05,
      "loss": 1.7445,
      "step": 169
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.940129816532135,
      "learning_rate": 2.6257224545652688e-05,
      "loss": 2.1189,
      "step": 170
    },
    {
      "epoch": 0.5472,
      "grad_norm": 0.9087008833885193,
      "learning_rate": 2.5978004313897104e-05,
      "loss": 1.9498,
      "step": 171
    },
    {
      "epoch": 0.5504,
      "grad_norm": 0.8524187207221985,
      "learning_rate": 2.569866183932368e-05,
      "loss": 1.9588,
      "step": 172
    },
    {
      "epoch": 0.5536,
      "grad_norm": 0.8133301734924316,
      "learning_rate": 2.5419232037536316e-05,
      "loss": 1.8297,
      "step": 173
    },
    {
      "epoch": 0.5568,
      "grad_norm": 0.8447262644767761,
      "learning_rate": 2.5139749835054123e-05,
      "loss": 1.7685,
      "step": 174
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8986214995384216,
      "learning_rate": 2.4860250164945876e-05,
      "loss": 1.7743,
      "step": 175
    },
    {
      "epoch": 0.5632,
      "grad_norm": 0.9351289868354797,
      "learning_rate": 2.4580767962463687e-05,
      "loss": 1.8997,
      "step": 176
    },
    {
      "epoch": 0.5664,
      "grad_norm": 0.9037435054779053,
      "learning_rate": 2.4301338160676324e-05,
      "loss": 1.9691,
      "step": 177
    },
    {
      "epoch": 0.5696,
      "grad_norm": 0.849233865737915,
      "learning_rate": 2.40219956861029e-05,
      "loss": 1.8385,
      "step": 178
    },
    {
      "epoch": 0.5728,
      "grad_norm": 0.9789981245994568,
      "learning_rate": 2.374277545434732e-05,
      "loss": 1.8535,
      "step": 179
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.8241089582443237,
      "learning_rate": 2.346371236573409e-05,
      "loss": 1.6937,
      "step": 180
    },
    {
      "epoch": 0.5792,
      "grad_norm": 0.8240427374839783,
      "learning_rate": 2.318484130094607e-05,
      "loss": 1.8915,
      "step": 181
    },
    {
      "epoch": 0.5824,
      "grad_norm": 0.9362565875053406,
      "learning_rate": 2.2906197116664653e-05,
      "loss": 1.8763,
      "step": 182
    },
    {
      "epoch": 0.5856,
      "grad_norm": 0.880988359451294,
      "learning_rate": 2.262781464121296e-05,
      "loss": 2.0785,
      "step": 183
    },
    {
      "epoch": 0.5888,
      "grad_norm": 0.8171457648277283,
      "learning_rate": 2.2349728670202582e-05,
      "loss": 1.8087,
      "step": 184
    },
    {
      "epoch": 0.592,
      "grad_norm": 0.9022923111915588,
      "learning_rate": 2.2071973962184384e-05,
      "loss": 1.9251,
      "step": 185
    },
    {
      "epoch": 0.5952,
      "grad_norm": 0.8642033338546753,
      "learning_rate": 2.1794585234303993e-05,
      "loss": 1.7433,
      "step": 186
    },
    {
      "epoch": 0.5984,
      "grad_norm": 0.9066776633262634,
      "learning_rate": 2.1517597157962392e-05,
      "loss": 1.7444,
      "step": 187
    },
    {
      "epoch": 0.6016,
      "grad_norm": 1.0174524784088135,
      "learning_rate": 2.124104435448228e-05,
      "loss": 1.9772,
      "step": 188
    },
    {
      "epoch": 0.6048,
      "grad_norm": 0.9552676677703857,
      "learning_rate": 2.0964961390780703e-05,
      "loss": 1.8762,
      "step": 189
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.9591497778892517,
      "learning_rate": 2.0689382775048418e-05,
      "loss": 1.9189,
      "step": 190
    },
    {
      "epoch": 0.6112,
      "grad_norm": 0.858933687210083,
      "learning_rate": 2.0414342952436694e-05,
      "loss": 1.8193,
      "step": 191
    },
    {
      "epoch": 0.6144,
      "grad_norm": 0.8183876872062683,
      "learning_rate": 2.0139876300751904e-05,
      "loss": 1.6413,
      "step": 192
    },
    {
      "epoch": 0.6176,
      "grad_norm": 0.8995866775512695,
      "learning_rate": 1.9866017126158574e-05,
      "loss": 2.0298,
      "step": 193
    },
    {
      "epoch": 0.6208,
      "grad_norm": 0.8391925692558289,
      "learning_rate": 1.9592799658891385e-05,
      "loss": 1.8856,
      "step": 194
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.8501824736595154,
      "learning_rate": 1.9320258048976702e-05,
      "loss": 1.8058,
      "step": 195
    },
    {
      "epoch": 0.6272,
      "grad_norm": 0.8973059058189392,
      "learning_rate": 1.904842636196402e-05,
      "loss": 2.0181,
      "step": 196
    },
    {
      "epoch": 0.6304,
      "grad_norm": 0.8554884195327759,
      "learning_rate": 1.8777338574668095e-05,
      "loss": 1.8791,
      "step": 197
    },
    {
      "epoch": 0.6336,
      "grad_norm": 0.9751168489456177,
      "learning_rate": 1.850702857092208e-05,
      "loss": 2.1171,
      "step": 198
    },
    {
      "epoch": 0.6368,
      "grad_norm": 0.8239455223083496,
      "learning_rate": 1.8237530137342335e-05,
      "loss": 1.9513,
      "step": 199
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.8867501020431519,
      "learning_rate": 1.796887695910535e-05,
      "loss": 2.1136,
      "step": 200
    },
    {
      "epoch": 0.6432,
      "grad_norm": 0.8486935496330261,
      "learning_rate": 1.7701102615737368e-05,
      "loss": 1.9146,
      "step": 201
    },
    {
      "epoch": 0.6464,
      "grad_norm": 0.8988845348358154,
      "learning_rate": 1.7434240576917226e-05,
      "loss": 1.997,
      "step": 202
    },
    {
      "epoch": 0.6496,
      "grad_norm": 0.8450900912284851,
      "learning_rate": 1.7168324198292888e-05,
      "loss": 1.8575,
      "step": 203
    },
    {
      "epoch": 0.6528,
      "grad_norm": 0.9029667377471924,
      "learning_rate": 1.6903386717312236e-05,
      "loss": 1.9964,
      "step": 204
    },
    {
      "epoch": 0.656,
      "grad_norm": 0.8760489821434021,
      "learning_rate": 1.6639461249068726e-05,
      "loss": 2.1632,
      "step": 205
    },
    {
      "epoch": 0.6592,
      "grad_norm": 0.8516858816146851,
      "learning_rate": 1.637658078216217e-05,
      "loss": 1.8699,
      "step": 206
    },
    {
      "epoch": 0.6624,
      "grad_norm": 0.8790733814239502,
      "learning_rate": 1.6114778174575473e-05,
      "loss": 1.6239,
      "step": 207
    },
    {
      "epoch": 0.6656,
      "grad_norm": 0.9619203805923462,
      "learning_rate": 1.585408614956763e-05,
      "loss": 2.3154,
      "step": 208
    },
    {
      "epoch": 0.6688,
      "grad_norm": 0.8987338542938232,
      "learning_rate": 1.559453729158361e-05,
      "loss": 1.7751,
      "step": 209
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.8577209711074829,
      "learning_rate": 1.5336164042181494e-05,
      "loss": 1.8087,
      "step": 210
    },
    {
      "epoch": 0.6752,
      "grad_norm": 0.8519135117530823,
      "learning_rate": 1.5078998695977586e-05,
      "loss": 1.8299,
      "step": 211
    },
    {
      "epoch": 0.6784,
      "grad_norm": 0.9367295503616333,
      "learning_rate": 1.482307339660983e-05,
      "loss": 2.177,
      "step": 212
    },
    {
      "epoch": 0.6816,
      "grad_norm": 0.9398857951164246,
      "learning_rate": 1.4568420132720106e-05,
      "loss": 2.1118,
      "step": 213
    },
    {
      "epoch": 0.6848,
      "grad_norm": 0.8866623640060425,
      "learning_rate": 1.4315070733955888e-05,
      "loss": 2.0044,
      "step": 214
    },
    {
      "epoch": 0.688,
      "grad_norm": 0.9234758615493774,
      "learning_rate": 1.4063056866991826e-05,
      "loss": 1.9386,
      "step": 215
    },
    {
      "epoch": 0.6912,
      "grad_norm": 0.8017230033874512,
      "learning_rate": 1.381241003157162e-05,
      "loss": 1.6357,
      "step": 216
    },
    {
      "epoch": 0.6944,
      "grad_norm": 0.8828235864639282,
      "learning_rate": 1.3563161556570826e-05,
      "loss": 1.9193,
      "step": 217
    },
    {
      "epoch": 0.6976,
      "grad_norm": 0.8700492978096008,
      "learning_rate": 1.3315342596080996e-05,
      "loss": 1.6987,
      "step": 218
    },
    {
      "epoch": 0.7008,
      "grad_norm": 0.8625661134719849,
      "learning_rate": 1.3068984125515644e-05,
      "loss": 1.9628,
      "step": 219
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.8373908400535583,
      "learning_rate": 1.2824116937738579e-05,
      "loss": 1.664,
      "step": 220
    },
    {
      "epoch": 0.7072,
      "grad_norm": 0.8767737150192261,
      "learning_rate": 1.2580771639215027e-05,
      "loss": 2.0254,
      "step": 221
    },
    {
      "epoch": 0.7104,
      "grad_norm": 0.8676634430885315,
      "learning_rate": 1.2338978646186084e-05,
      "loss": 1.7836,
      "step": 222
    },
    {
      "epoch": 0.7136,
      "grad_norm": 0.8328086733818054,
      "learning_rate": 1.2098768180866895e-05,
      "loss": 1.7046,
      "step": 223
    },
    {
      "epoch": 0.7168,
      "grad_norm": 0.8744992613792419,
      "learning_rate": 1.1860170267669174e-05,
      "loss": 1.8117,
      "step": 224
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.7836005687713623,
      "learning_rate": 1.1623214729448317e-05,
      "loss": 1.7427,
      "step": 225
    },
    {
      "epoch": 0.7232,
      "grad_norm": 0.7778053283691406,
      "learning_rate": 1.1387931183775822e-05,
      "loss": 1.6208,
      "step": 226
    },
    {
      "epoch": 0.7264,
      "grad_norm": 0.895233690738678,
      "learning_rate": 1.1154349039237322e-05,
      "loss": 1.834,
      "step": 227
    },
    {
      "epoch": 0.7296,
      "grad_norm": 0.864975094795227,
      "learning_rate": 1.0922497491756734e-05,
      "loss": 1.7659,
      "step": 228
    },
    {
      "epoch": 0.7328,
      "grad_norm": 0.9300910830497742,
      "learning_rate": 1.0692405520947028e-05,
      "loss": 2.0783,
      "step": 229
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.9012324810028076,
      "learning_rate": 1.0464101886487958e-05,
      "loss": 1.9243,
      "step": 230
    },
    {
      "epoch": 0.7392,
      "grad_norm": 0.8135029673576355,
      "learning_rate": 1.0237615124531363e-05,
      "loss": 1.7498,
      "step": 231
    },
    {
      "epoch": 0.7424,
      "grad_norm": 0.8468223214149475,
      "learning_rate": 1.0012973544134358e-05,
      "loss": 1.8409,
      "step": 232
    },
    {
      "epoch": 0.7456,
      "grad_norm": 0.8117995262145996,
      "learning_rate": 9.79020522372093e-06,
      "loss": 1.7893,
      "step": 233
    },
    {
      "epoch": 0.7488,
      "grad_norm": 0.8934926986694336,
      "learning_rate": 9.569338007572382e-06,
      "loss": 1.9382,
      "step": 234
    },
    {
      "epoch": 0.752,
      "grad_norm": 0.8657869100570679,
      "learning_rate": 9.35039950234696e-06,
      "loss": 1.7477,
      "step": 235
    },
    {
      "epoch": 0.7552,
      "grad_norm": 0.871850311756134,
      "learning_rate": 9.133417073629289e-06,
      "loss": 1.9922,
      "step": 236
    },
    {
      "epoch": 0.7584,
      "grad_norm": 0.8558037877082825,
      "learning_rate": 8.918417842509867e-06,
      "loss": 1.8059,
      "step": 237
    },
    {
      "epoch": 0.7616,
      "grad_norm": 0.8291991353034973,
      "learning_rate": 8.705428682195155e-06,
      "loss": 1.8627,
      "step": 238
    },
    {
      "epoch": 0.7648,
      "grad_norm": 0.9084640741348267,
      "learning_rate": 8.494476214648626e-06,
      "loss": 1.9113,
      "step": 239
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.8608958125114441,
      "learning_rate": 8.285586807263254e-06,
      "loss": 1.937,
      "step": 240
    },
    {
      "epoch": 0.7712,
      "grad_norm": 0.8367571830749512,
      "learning_rate": 8.078786569565763e-06,
      "loss": 1.7382,
      "step": 241
    },
    {
      "epoch": 0.7744,
      "grad_norm": 0.9098989367485046,
      "learning_rate": 7.874101349953167e-06,
      "loss": 2.0156,
      "step": 242
    },
    {
      "epoch": 0.7776,
      "grad_norm": 0.8325051069259644,
      "learning_rate": 7.671556732461905e-06,
      "loss": 1.6789,
      "step": 243
    },
    {
      "epoch": 0.7808,
      "grad_norm": 0.855073869228363,
      "learning_rate": 7.471178033570081e-06,
      "loss": 2.0656,
      "step": 244
    },
    {
      "epoch": 0.784,
      "grad_norm": 0.8020650148391724,
      "learning_rate": 7.272990299033045e-06,
      "loss": 1.6928,
      "step": 245
    },
    {
      "epoch": 0.7872,
      "grad_norm": 0.980696439743042,
      "learning_rate": 7.077018300752916e-06,
      "loss": 1.8311,
      "step": 246
    },
    {
      "epoch": 0.7904,
      "grad_norm": 0.8271718621253967,
      "learning_rate": 6.883286533682265e-06,
      "loss": 1.9569,
      "step": 247
    },
    {
      "epoch": 0.7936,
      "grad_norm": 0.9465168118476868,
      "learning_rate": 6.691819212762454e-06,
      "loss": 2.1162,
      "step": 248
    },
    {
      "epoch": 0.7968,
      "grad_norm": 0.878783643245697,
      "learning_rate": 6.502640269896953e-06,
      "loss": 1.7634,
      "step": 249
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9123842120170593,
      "learning_rate": 6.3157733509600355e-06,
      "loss": 2.2862,
      "step": 250
    },
    {
      "epoch": 0.8032,
      "grad_norm": 0.8306463956832886,
      "learning_rate": 6.1312418128412565e-06,
      "loss": 1.7149,
      "step": 251
    },
    {
      "epoch": 0.8064,
      "grad_norm": 0.8042676448822021,
      "learning_rate": 5.949068720525991e-06,
      "loss": 1.5697,
      "step": 252
    },
    {
      "epoch": 0.8096,
      "grad_norm": 0.8570838570594788,
      "learning_rate": 5.769276844212501e-06,
      "loss": 1.9575,
      "step": 253
    },
    {
      "epoch": 0.8128,
      "grad_norm": 0.853945791721344,
      "learning_rate": 5.591888656465874e-06,
      "loss": 1.6021,
      "step": 254
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.7824327945709229,
      "learning_rate": 5.416926329409083e-06,
      "loss": 1.7469,
      "step": 255
    },
    {
      "epoch": 0.8192,
      "grad_norm": 0.9162058234214783,
      "learning_rate": 5.244411731951671e-06,
      "loss": 1.9214,
      "step": 256
    },
    {
      "epoch": 0.8224,
      "grad_norm": 0.8960239887237549,
      "learning_rate": 5.074366427056309e-06,
      "loss": 1.8902,
      "step": 257
    },
    {
      "epoch": 0.8256,
      "grad_norm": 0.8540034294128418,
      "learning_rate": 4.90681166904359e-06,
      "loss": 1.7701,
      "step": 258
    },
    {
      "epoch": 0.8288,
      "grad_norm": 0.8078952431678772,
      "learning_rate": 4.741768400935417e-06,
      "loss": 1.7476,
      "step": 259
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.8801387548446655,
      "learning_rate": 4.579257251837271e-06,
      "loss": 1.9394,
      "step": 260
    },
    {
      "epoch": 0.8352,
      "grad_norm": 0.9091523885726929,
      "learning_rate": 4.419298534359759e-06,
      "loss": 1.9172,
      "step": 261
    },
    {
      "epoch": 0.8384,
      "grad_norm": 0.86611407995224,
      "learning_rate": 4.261912242079674e-06,
      "loss": 1.7744,
      "step": 262
    },
    {
      "epoch": 0.8416,
      "grad_norm": 0.8813964128494263,
      "learning_rate": 4.107118047040995e-06,
      "loss": 1.964,
      "step": 263
    },
    {
      "epoch": 0.8448,
      "grad_norm": 1.0881954431533813,
      "learning_rate": 3.954935297295975e-06,
      "loss": 1.7736,
      "step": 264
    },
    {
      "epoch": 0.848,
      "grad_norm": 0.8743950128555298,
      "learning_rate": 3.8053830144868547e-06,
      "loss": 1.9119,
      "step": 265
    },
    {
      "epoch": 0.8512,
      "grad_norm": 0.8601691126823425,
      "learning_rate": 3.6584798914682582e-06,
      "loss": 1.8711,
      "step": 266
    },
    {
      "epoch": 0.8544,
      "grad_norm": 0.884376585483551,
      "learning_rate": 3.514244289970753e-06,
      "loss": 2.1294,
      "step": 267
    },
    {
      "epoch": 0.8576,
      "grad_norm": 0.9268573522567749,
      "learning_rate": 3.3726942383057763e-06,
      "loss": 2.061,
      "step": 268
    },
    {
      "epoch": 0.8608,
      "grad_norm": 0.7765485644340515,
      "learning_rate": 3.233847429112244e-06,
      "loss": 1.5819,
      "step": 269
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.8231096267700195,
      "learning_rate": 3.0977212171451e-06,
      "loss": 1.8227,
      "step": 270
    },
    {
      "epoch": 0.8672,
      "grad_norm": 0.8997424244880676,
      "learning_rate": 2.9643326171061165e-06,
      "loss": 1.6759,
      "step": 271
    },
    {
      "epoch": 0.8704,
      "grad_norm": 0.8399364948272705,
      "learning_rate": 2.833698301517185e-06,
      "loss": 1.6793,
      "step": 272
    },
    {
      "epoch": 0.8736,
      "grad_norm": 0.920159101486206,
      "learning_rate": 2.7058345986363974e-06,
      "loss": 2.0572,
      "step": 273
    },
    {
      "epoch": 0.8768,
      "grad_norm": 0.9057918787002563,
      "learning_rate": 2.5807574904171155e-06,
      "loss": 2.2139,
      "step": 274
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.8899713158607483,
      "learning_rate": 2.4584826105103764e-06,
      "loss": 1.9027,
      "step": 275
    },
    {
      "epoch": 0.8832,
      "grad_norm": 1.0192358493804932,
      "learning_rate": 2.3390252423108076e-06,
      "loss": 1.8376,
      "step": 276
    },
    {
      "epoch": 0.8864,
      "grad_norm": 0.8623155951499939,
      "learning_rate": 2.222400317046308e-06,
      "loss": 1.4057,
      "step": 277
    },
    {
      "epoch": 0.8896,
      "grad_norm": 0.9236856698989868,
      "learning_rate": 2.108622411911773e-06,
      "loss": 1.9201,
      "step": 278
    },
    {
      "epoch": 0.8928,
      "grad_norm": 0.9156610369682312,
      "learning_rate": 1.997705748247067e-06,
      "loss": 2.0085,
      "step": 279
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.2603415250778198,
      "learning_rate": 1.8896641897594492e-06,
      "loss": 1.7741,
      "step": 280
    },
    {
      "epoch": 0.8992,
      "grad_norm": 0.8280951380729675,
      "learning_rate": 1.78451124079074e-06,
      "loss": 1.8905,
      "step": 281
    },
    {
      "epoch": 0.9024,
      "grad_norm": 0.8544325828552246,
      "learning_rate": 1.6822600446293636e-06,
      "loss": 1.9967,
      "step": 282
    },
    {
      "epoch": 0.9056,
      "grad_norm": 0.9109524488449097,
      "learning_rate": 1.5829233818675766e-06,
      "loss": 1.73,
      "step": 283
    },
    {
      "epoch": 0.9088,
      "grad_norm": 0.8589962720870972,
      "learning_rate": 1.486513668803946e-06,
      "loss": 1.827,
      "step": 284
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.8872967958450317,
      "learning_rate": 1.3930429558914494e-06,
      "loss": 1.6534,
      "step": 285
    },
    {
      "epoch": 0.9152,
      "grad_norm": 0.8933804631233215,
      "learning_rate": 1.3025229262312366e-06,
      "loss": 1.9079,
      "step": 286
    },
    {
      "epoch": 0.9184,
      "grad_norm": 0.8584591746330261,
      "learning_rate": 1.214964894112361e-06,
      "loss": 1.8498,
      "step": 287
    },
    {
      "epoch": 0.9216,
      "grad_norm": 0.8858172297477722,
      "learning_rate": 1.1303798035975643e-06,
      "loss": 1.7681,
      "step": 288
    },
    {
      "epoch": 0.9248,
      "grad_norm": 0.8581066131591797,
      "learning_rate": 1.0487782271553504e-06,
      "loss": 1.9958,
      "step": 289
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.8548694849014282,
      "learning_rate": 9.701703643385295e-07,
      "loss": 1.8107,
      "step": 290
    },
    {
      "epoch": 0.9312,
      "grad_norm": 0.8163594603538513,
      "learning_rate": 8.94566040509337e-07,
      "loss": 1.8072,
      "step": 291
    },
    {
      "epoch": 0.9344,
      "grad_norm": 0.9220604300498962,
      "learning_rate": 8.219747056113586e-07,
      "loss": 2.109,
      "step": 292
    },
    {
      "epoch": 0.9376,
      "grad_norm": 0.8807504773139954,
      "learning_rate": 7.524054329883346e-07,
      "loss": 1.6955,
      "step": 293
    },
    {
      "epoch": 0.9408,
      "grad_norm": 0.8602936863899231,
      "learning_rate": 6.858669182500971e-07,
      "loss": 1.849,
      "step": 294
    },
    {
      "epoch": 0.944,
      "grad_norm": 0.8880540132522583,
      "learning_rate": 6.223674781856592e-07,
      "loss": 2.0819,
      "step": 295
    },
    {
      "epoch": 0.9472,
      "grad_norm": 0.915898859500885,
      "learning_rate": 5.619150497236992e-07,
      "loss": 2.2103,
      "step": 296
    },
    {
      "epoch": 0.9504,
      "grad_norm": 0.9282847046852112,
      "learning_rate": 5.045171889404954e-07,
      "loss": 2.1302,
      "step": 297
    },
    {
      "epoch": 0.9536,
      "grad_norm": 0.801628053188324,
      "learning_rate": 4.501810701154907e-07,
      "loss": 1.8129,
      "step": 298
    },
    {
      "epoch": 0.9568,
      "grad_norm": 0.8396210670471191,
      "learning_rate": 3.98913484834551e-07,
      "loss": 1.3778,
      "step": 299
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.8347736597061157,
      "learning_rate": 3.507208411410778e-07,
      "loss": 1.7383,
      "step": 300
    },
    {
      "epoch": 0.9632,
      "grad_norm": 0.876707136631012,
      "learning_rate": 3.0560916273504325e-07,
      "loss": 1.8849,
      "step": 301
    },
    {
      "epoch": 0.9664,
      "grad_norm": 0.9329042434692383,
      "learning_rate": 2.635840882200924e-07,
      "loss": 1.8339,
      "step": 302
    },
    {
      "epoch": 0.9696,
      "grad_norm": 0.8642157912254333,
      "learning_rate": 2.246508703987543e-07,
      "loss": 1.916,
      "step": 303
    },
    {
      "epoch": 0.9728,
      "grad_norm": 0.9208475351333618,
      "learning_rate": 1.8881437561586722e-07,
      "loss": 1.7845,
      "step": 304
    },
    {
      "epoch": 0.976,
      "grad_norm": 0.8702243566513062,
      "learning_rate": 1.5607908315035667e-07,
      "loss": 2.0131,
      "step": 305
    },
    {
      "epoch": 0.9792,
      "grad_norm": 0.879602313041687,
      "learning_rate": 1.264490846553279e-07,
      "loss": 1.9444,
      "step": 306
    },
    {
      "epoch": 0.9824,
      "grad_norm": 0.8321172595024109,
      "learning_rate": 9.992808364666373e-08,
      "loss": 1.7554,
      "step": 307
    },
    {
      "epoch": 0.9856,
      "grad_norm": 0.9140818119049072,
      "learning_rate": 7.651939504010885e-08,
      "loss": 1.9615,
      "step": 308
    },
    {
      "epoch": 0.9888,
      "grad_norm": 0.8076279163360596,
      "learning_rate": 5.622594473692067e-08,
      "loss": 1.6552,
      "step": 309
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.8180491328239441,
      "learning_rate": 3.90502692581729e-08,
      "loss": 1.8337,
      "step": 310
    },
    {
      "epoch": 0.9952,
      "grad_norm": 0.919771671295166,
      "learning_rate": 2.4994515427695374e-08,
      "loss": 1.7055,
      "step": 311
    },
    {
      "epoch": 0.9984,
      "grad_norm": 0.8689696192741394,
      "learning_rate": 1.4060440103746964e-08,
      "loss": 1.8763,
      "step": 312
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.3404301404953003,
      "learning_rate": 6.249409959421803e-09,
      "loss": 1.8715,
      "step": 313
    }
  ],
  "logging_steps": 1,
  "max_steps": 313,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.23639789568e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}