{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0006856359273226, "eval_steps": 500, "global_step": 1459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006856359273225917, "grad_norm": 0.15940325862016855, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": 90584446.08955224, "logits/rejected": 142331299.67213115, "logps/chosen": -228.17910447761193, "logps/rejected": -270.6885245901639, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0013712718546451835, "grad_norm": 0.17525783103107823, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "logits/chosen": 131802825.6969697, "logits/rejected": 129888123.87096775, "logps/chosen": -220.12121212121212, "logps/rejected": -361.2903225806452, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.002056907781967775, "grad_norm": 0.18730848898715646, "kl": 0.072265625, "learning_rate": 1.0000000000000001e-07, "logits/chosen": 114603188.70588236, "logits/rejected": 167911970.13333333, "logps/chosen": -260.47058823529414, "logps/rejected": -352.0, "loss": 0.5048, "rewards/chosen": -0.01736270680147059, "rewards/margins": -0.03904239430147059, "rewards/rejected": 0.0216796875, "step": 3 }, { "epoch": 0.002742543709290367, "grad_norm": 0.15617459048181181, "kl": 0.0546875, "learning_rate": 1.5000000000000002e-07, "logits/chosen": 108113704.42105263, "logits/rejected": 129055507.6923077, "logps/chosen": -232.8421052631579, "logps/rejected": -260.7692307692308, "loss": 0.4998, "rewards/chosen": -0.012473658511513159, "rewards/margins": 0.0071796618009868415, "rewards/rejected": -0.0196533203125, "step": 4 }, { "epoch": 0.0034281796366129585, "grad_norm": 0.17135161866154622, "kl": 0.06640625, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 175767552.0, "logits/rejected": 89849856.0, "logps/chosen": -254.75, "logps/rejected": -330.0, "loss": 0.5003, "rewards/chosen": 0.0045623779296875, "rewards/margins": -0.0071659088134765625, "rewards/rejected": 0.011728286743164062, "step": 5 }, { "epoch": 0.00411381556393555, "grad_norm": 0.17554526313955254, "kl": 0.2216796875, "learning_rate": 2.5000000000000004e-07, "logits/chosen": 160269672.56338027, "logits/rejected": 166300474.3859649, "logps/chosen": -333.07042253521126, "logps/rejected": -304.56140350877195, "loss": 0.4987, "rewards/chosen": 0.025269414337588027, "rewards/margins": 0.011494743284956449, "rewards/rejected": 0.013774671052631578, "step": 6 }, { "epoch": 0.0047994514912581415, "grad_norm": 0.17289882537355702, "kl": 0.046875, "learning_rate": 3.0000000000000004e-07, "logits/chosen": 123092158.91525424, "logits/rejected": 128017452.52173913, "logps/chosen": -228.61016949152543, "logps/rejected": -281.27536231884056, "loss": 0.4988, "rewards/chosen": 0.006000066207627119, "rewards/margins": 0.009828474269221323, "rewards/rejected": -0.003828408061594203, "step": 7 }, { "epoch": 0.005485087418580734, "grad_norm": 0.15664295658835925, "kl": 0.080078125, "learning_rate": 3.5000000000000004e-07, "logits/chosen": 142748515.79661018, "logits/rejected": 120419075.71014492, "logps/chosen": -277.96610169491527, "logps/rejected": -246.2608695652174, "loss": 0.5028, "rewards/chosen": -0.006264896716101695, "rewards/margins": -0.021840360937116186, "rewards/rejected": 0.015575464221014492, "step": 8 }, { "epoch": 0.0061707233459033254, "grad_norm": 0.18615871770424294, "kl": 0.12109375, "learning_rate": 4.0000000000000003e-07, "logits/chosen": 144813864.42105263, "logits/rejected": 125710970.59154929, "logps/chosen": -225.68421052631578, "logps/rejected": -361.46478873239437, "loss": 0.4983, "rewards/chosen": 0.0032980400219298246, "rewards/margins": 0.011330610444465036, "rewards/rejected": -0.008032570422535211, "step": 9 }, { "epoch": 0.006856359273225917, "grad_norm": 0.17508138524501887, "kl": 0.0546875, "learning_rate": 4.5000000000000003e-07, "logits/chosen": 109189422.16393442, "logits/rejected": 151495756.41791046, "logps/chosen": -219.80327868852459, "logps/rejected": -353.910447761194, "loss": 0.4981, "rewards/chosen": 0.005911404969262295, "rewards/margins": 0.016675455808814535, "rewards/rejected": -0.01076405083955224, "step": 10 }, { "epoch": 0.0075419952005485085, "grad_norm": 0.16403092844224004, "kl": 0.0078125, "learning_rate": 5.000000000000001e-07, "logits/chosen": 145358848.0, "logits/rejected": 124682240.0, "logps/chosen": -239.625, "logps/rejected": -251.25, "loss": 0.4989, "rewards/chosen": -0.00519561767578125, "rewards/margins": 0.01129913330078125, "rewards/rejected": -0.0164947509765625, "step": 11 }, { "epoch": 0.0082276311278711, "grad_norm": 0.16991910083173625, "kl": 0.052734375, "learning_rate": 5.5e-07, "logits/chosen": 111657456.48484848, "logits/rejected": 115005109.67741935, "logps/chosen": -272.24242424242425, "logps/rejected": -293.6774193548387, "loss": 0.498, "rewards/chosen": -0.0007990056818181819, "rewards/margins": 0.015958098022818917, "rewards/rejected": -0.016757103704637098, "step": 12 }, { "epoch": 0.008913267055193692, "grad_norm": 0.17690330425618253, "kl": 0.091796875, "learning_rate": 6.000000000000001e-07, "logits/chosen": 124113268.36363636, "logits/rejected": 151130244.12903225, "logps/chosen": -222.54545454545453, "logps/rejected": -334.4516129032258, "loss": 0.502, "rewards/chosen": -0.007331616950757576, "rewards/margins": -0.017128743966886607, "rewards/rejected": 0.009797127016129033, "step": 13 }, { "epoch": 0.009598902982516283, "grad_norm": 0.15134049365985444, "kl": 0.125, "learning_rate": 6.5e-07, "logits/chosen": 153373802.98507464, "logits/rejected": 74844260.72131148, "logps/chosen": -234.7462686567164, "logps/rejected": -268.327868852459, "loss": 0.5031, "rewards/chosen": -0.02418085354477612, "rewards/margins": -0.019003871603177756, "rewards/rejected": -0.005176981941598361, "step": 14 }, { "epoch": 0.010284538909838875, "grad_norm": 0.16745051806629846, "kl": 0.037109375, "learning_rate": 7.000000000000001e-07, "logits/chosen": 157380302.3283582, "logits/rejected": 103551177.44262294, "logps/chosen": -261.25373134328356, "logps/rejected": -344.655737704918, "loss": 0.4989, "rewards/chosen": 0.013642723880597014, "rewards/margins": 0.008615828388793736, "rewards/rejected": 0.005026895491803279, "step": 15 }, { "epoch": 0.010970174837161468, "grad_norm": 0.15651598930743085, "kl": 0.044921875, "learning_rate": 7.5e-07, "logits/chosen": 155538773.33333334, "logits/rejected": 86882011.42857143, "logps/chosen": -251.77777777777777, "logps/rejected": -297.14285714285717, "loss": 0.4988, "rewards/chosen": 0.009006076388888888, "rewards/margins": 0.009006076388888888, "rewards/rejected": 0.0, "step": 16 }, { "epoch": 0.011655810764484058, "grad_norm": 0.17340064437348254, "kl": 0.076171875, "learning_rate": 8.000000000000001e-07, "logits/chosen": 115949568.0, "logits/rejected": 125304832.0, "logps/chosen": -248.5, "logps/rejected": -368.5, "loss": 0.5028, "rewards/chosen": 0.003894805908203125, "rewards/margins": -0.028209686279296875, "rewards/rejected": 0.0321044921875, "step": 17 }, { "epoch": 0.012341446691806651, "grad_norm": 0.149311779082114, "kl": 0.0546875, "learning_rate": 8.500000000000001e-07, "logits/chosen": 89115342.12987013, "logits/rejected": 125294551.84313725, "logps/chosen": -190.85714285714286, "logps/rejected": -273.88235294117646, "loss": 0.4979, "rewards/chosen": 0.00020292207792207794, "rewards/margins": 0.01691937428380443, "rewards/rejected": -0.016716452205882353, "step": 18 }, { "epoch": 0.013027082619129242, "grad_norm": 0.16704560860414172, "kl": 0.048828125, "learning_rate": 9.000000000000001e-07, "logits/chosen": 152492909.7142857, "logits/rejected": 96235975.1111111, "logps/chosen": -227.42857142857142, "logps/rejected": -284.22222222222223, "loss": 0.5019, "rewards/chosen": -0.0015694754464285715, "rewards/margins": -0.01188780769469246, "rewards/rejected": 0.010318332248263888, "step": 19 }, { "epoch": 0.013712718546451834, "grad_norm": 0.17506787160049783, "kl": 0.05078125, "learning_rate": 9.500000000000001e-07, "logits/chosen": 148897792.0, "logits/rejected": 133477556.70588236, "logps/chosen": -316.26666666666665, "logps/rejected": -293.88235294117646, "loss": 0.5016, "rewards/chosen": 0.009554036458333333, "rewards/margins": -0.010235715379901962, "rewards/rejected": 0.019789751838235295, "step": 20 }, { "epoch": 0.014398354473774426, "grad_norm": 0.17236014098291852, "kl": 0.099609375, "learning_rate": 1.0000000000000002e-06, "logits/chosen": 143489347.36842105, "logits/rejected": 99304577.8028169, "logps/chosen": -242.80701754385964, "logps/rejected": -285.2957746478873, "loss": 0.5005, "rewards/chosen": -0.010099712171052632, "rewards/margins": -0.005828970533728689, "rewards/rejected": -0.004270741637323943, "step": 21 }, { "epoch": 0.015083990401097017, "grad_norm": 0.15529752711693107, "kl": 0.001953125, "learning_rate": 1.0500000000000001e-06, "logits/chosen": 132281895.38461539, "logits/rejected": 98383059.3015873, "logps/chosen": -272.0, "logps/rejected": -278.0952380952381, "loss": 0.5023, "rewards/chosen": 0.0016263521634615385, "rewards/margins": -0.01852106227106227, "rewards/rejected": 0.020147414434523808, "step": 22 }, { "epoch": 0.01576962632841961, "grad_norm": 0.17033726085237044, "kl": 0.107421875, "learning_rate": 1.1e-06, "logits/chosen": 138483121.8983051, "logits/rejected": 114978637.91304348, "logps/chosen": -239.72881355932202, "logps/rejected": -273.6231884057971, "loss": 0.5006, "rewards/chosen": -0.0008275953389830508, "rewards/margins": -0.008852391534635224, "rewards/rejected": 0.008024796195652174, "step": 23 }, { "epoch": 0.0164552622557422, "grad_norm": 0.16562177544441756, "kl": 0.125, "learning_rate": 1.1500000000000002e-06, "logits/chosen": 148559541.67741936, "logits/rejected": 75028790.3030303, "logps/chosen": -224.51612903225808, "logps/rejected": -271.75757575757575, "loss": 0.5001, "rewards/chosen": -0.014695690524193549, "rewards/margins": -0.009126694903928397, "rewards/rejected": -0.005568995620265152, "step": 24 }, { "epoch": 0.017140898183064794, "grad_norm": 0.16531593441146036, "kl": 0.0, "learning_rate": 1.2000000000000002e-06, "logits/chosen": 106887101.93548387, "logits/rejected": 80994552.24242425, "logps/chosen": -250.06451612903226, "logps/rejected": -329.6969696969697, "loss": 0.4976, "rewards/chosen": 0.012632308467741936, "rewards/margins": 0.02040041926319648, "rewards/rejected": -0.007768110795454545, "step": 25 }, { "epoch": 0.017826534110387385, "grad_norm": 0.17539307553523784, "kl": 0.068359375, "learning_rate": 1.25e-06, "logits/chosen": 88779434.66666667, "logits/rejected": 143147536.51612905, "logps/chosen": -238.3030303030303, "logps/rejected": -301.93548387096774, "loss": 0.5025, "rewards/chosen": -0.008618903882575758, "rewards/margins": -0.018195516785801565, "rewards/rejected": 0.009576612903225807, "step": 26 }, { "epoch": 0.018512170037709975, "grad_norm": 0.18591212481007463, "kl": 0.146484375, "learning_rate": 1.3e-06, "logits/chosen": 144359179.46268657, "logits/rejected": 121978611.40983607, "logps/chosen": -260.05970149253733, "logps/rejected": -311.8688524590164, "loss": 0.5001, "rewards/chosen": -0.0012534981343283581, "rewards/margins": -0.001125424363836555, "rewards/rejected": -0.00012807377049180329, "step": 27 }, { "epoch": 0.019197805965032566, "grad_norm": 0.13319048287580218, "kl": 0.013671875, "learning_rate": 1.3500000000000002e-06, "logits/chosen": 166534496.52459016, "logits/rejected": 28350677.970149253, "logps/chosen": -227.40983606557376, "logps/rejected": -222.32835820895522, "loss": 0.5006, "rewards/chosen": 0.0036921266649590162, "rewards/margins": -0.0022692772529514316, "rewards/rejected": 0.005961403917910448, "step": 28 }, { "epoch": 0.01988344189235516, "grad_norm": 0.18236938076974799, "kl": 0.0390625, "learning_rate": 1.4000000000000001e-06, "logits/chosen": 245771497.54385966, "logits/rejected": 85481097.0140845, "logps/chosen": -372.49122807017545, "logps/rejected": -262.76056338028167, "loss": 0.4967, "rewards/chosen": 0.00424890350877193, "rewards/margins": 0.022487239776377562, "rewards/rejected": -0.018238336267605633, "step": 29 }, { "epoch": 0.02056907781967775, "grad_norm": 0.18494735053361253, "kl": 0.197265625, "learning_rate": 1.45e-06, "logits/chosen": 172678704.3018868, "logits/rejected": 119174157.65333334, "logps/chosen": -287.0943396226415, "logps/rejected": -334.08, "loss": 0.4995, "rewards/chosen": -0.007434772995283019, "rewards/margins": -0.0041274813286163526, "rewards/rejected": -0.0033072916666666667, "step": 30 }, { "epoch": 0.02125471374700034, "grad_norm": 0.19938118016306147, "kl": 0.0, "learning_rate": 1.5e-06, "logits/chosen": 111326780.7457627, "logits/rejected": 129521931.13043478, "logps/chosen": -280.6779661016949, "logps/rejected": -323.2463768115942, "loss": 0.4973, "rewards/chosen": 0.010493908898305085, "rewards/margins": 0.023383826244319575, "rewards/rejected": -0.012889917346014492, "step": 31 }, { "epoch": 0.021940349674322936, "grad_norm": 0.17263126776584703, "kl": 0.0, "learning_rate": 1.5500000000000002e-06, "logits/chosen": 105291493.51724137, "logits/rejected": 123911723.88571429, "logps/chosen": -267.58620689655174, "logps/rejected": -265.14285714285717, "loss": 0.4988, "rewards/chosen": -0.014791554418103448, "rewards/margins": 0.006929985760467982, "rewards/rejected": -0.02172154017857143, "step": 32 }, { "epoch": 0.022625985601645526, "grad_norm": 0.20196193238514562, "kl": 0.0380859375, "learning_rate": 1.6000000000000001e-06, "logits/chosen": 155262832.28070176, "logits/rejected": 80341597.74647887, "logps/chosen": -226.80701754385964, "logps/rejected": -218.81690140845072, "loss": 0.496, "rewards/chosen": 0.0208740234375, "rewards/margins": 0.03299165107834507, "rewards/rejected": -0.01211762764084507, "step": 33 }, { "epoch": 0.023311621528968117, "grad_norm": 0.166720179600812, "kl": 0.03125, "learning_rate": 1.6500000000000003e-06, "logits/chosen": 178702770.42424244, "logits/rejected": 90109885.93548387, "logps/chosen": -239.63636363636363, "logps/rejected": -280.7741935483871, "loss": 0.4995, "rewards/chosen": -0.011304450757575758, "rewards/margins": 0.0037653262081500487, "rewards/rejected": -0.015069776965725807, "step": 34 }, { "epoch": 0.02399725745629071, "grad_norm": 0.17112418564624912, "kl": 0.05859375, "learning_rate": 1.7000000000000002e-06, "logits/chosen": 138006131.61290324, "logits/rejected": 73479757.57575758, "logps/chosen": -230.83870967741936, "logps/rejected": -287.030303030303, "loss": 0.4979, "rewards/chosen": 0.007367534022177419, "rewards/margins": 0.008980341787328934, "rewards/rejected": -0.0016128077651515152, "step": 35 }, { "epoch": 0.024682893383613302, "grad_norm": 0.20469991286359196, "kl": 0.109375, "learning_rate": 1.75e-06, "logits/chosen": 124595501.1764706, "logits/rejected": 120935765.33333333, "logps/chosen": -309.1764705882353, "logps/rejected": -322.1333333333333, "loss": 0.4995, "rewards/chosen": 0.004836138556985294, "rewards/margins": 0.0020692114736519606, "rewards/rejected": 0.0027669270833333335, "step": 36 }, { "epoch": 0.025368529310935892, "grad_norm": 0.18454171198615588, "kl": 0.0546875, "learning_rate": 1.8000000000000001e-06, "logits/chosen": 146250567.3442623, "logits/rejected": 135219352.83582088, "logps/chosen": -252.59016393442624, "logps/rejected": -263.1641791044776, "loss": 0.4955, "rewards/chosen": 0.015993212090163935, "rewards/margins": 0.03653017570956692, "rewards/rejected": -0.020536963619402986, "step": 37 }, { "epoch": 0.026054165238258483, "grad_norm": 0.19453753075245167, "kl": 0.16796875, "learning_rate": 1.85e-06, "logits/chosen": 149841510.4, "logits/rejected": 120666899.6923077, "logps/chosen": -252.96, "logps/rejected": -326.97435897435895, "loss": 0.4996, "rewards/chosen": 0.007490234375, "rewards/margins": -0.00029096554487179505, "rewards/rejected": 0.007781199919871795, "step": 38 }, { "epoch": 0.026739801165581077, "grad_norm": 0.15937110356615897, "kl": 0.0546875, "learning_rate": 1.9000000000000002e-06, "logits/chosen": 133582227.39393939, "logits/rejected": 119199413.67741935, "logps/chosen": -252.84848484848484, "logps/rejected": -262.7096774193548, "loss": 0.4986, "rewards/chosen": -0.0005400686553030303, "rewards/margins": 0.018432020558406644, "rewards/rejected": -0.018972089213709676, "step": 39 }, { "epoch": 0.027425437092903668, "grad_norm": 0.1850534612513403, "kl": 0.04296875, "learning_rate": 1.9500000000000004e-06, "logits/chosen": 163701217.88235295, "logits/rejected": 70743927.46666667, "logps/chosen": -285.6470588235294, "logps/rejected": -262.93333333333334, "loss": 0.4976, "rewards/chosen": 0.026015337775735295, "rewards/margins": 0.020839556525735294, "rewards/rejected": 0.00517578125, "step": 40 }, { "epoch": 0.02811107302022626, "grad_norm": 0.19778284478434435, "kl": 0.01953125, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 132429991.86885247, "logits/rejected": 172029065.5522388, "logps/chosen": -251.80327868852459, "logps/rejected": -345.3134328358209, "loss": 0.4978, "rewards/chosen": 0.007754466572745902, "rewards/margins": 0.017505516013044408, "rewards/rejected": -0.009751049440298507, "step": 41 }, { "epoch": 0.028796708947548853, "grad_norm": 0.180402312692544, "kl": 0.060546875, "learning_rate": 2.05e-06, "logits/chosen": 123432374.85714285, "logits/rejected": 124238177.10344827, "logps/chosen": -253.4857142857143, "logps/rejected": -336.55172413793105, "loss": 0.4967, "rewards/chosen": 0.002064732142857143, "rewards/margins": 1552429.933099215, "rewards/rejected": -1552429.9310344828, "step": 42 }, { "epoch": 0.029482344874871443, "grad_norm": 0.19390737705486502, "kl": 0.029296875, "learning_rate": 2.1000000000000002e-06, "logits/chosen": 102037292.13793103, "logits/rejected": 132060657.37142856, "logps/chosen": -211.0344827586207, "logps/rejected": -296.45714285714286, "loss": 0.4978, "rewards/chosen": 0.004748114224137931, "rewards/margins": 0.015183382081280786, "rewards/rejected": -0.010435267857142856, "step": 43 }, { "epoch": 0.030167980802194034, "grad_norm": 0.18170767048744893, "kl": 0.0, "learning_rate": 2.15e-06, "logits/chosen": 131742401.04918033, "logits/rejected": 109177107.10447761, "logps/chosen": -233.9672131147541, "logps/rejected": -273.910447761194, "loss": 0.4962, "rewards/chosen": 0.012701816246157786, "rewards/margins": 0.029915552253620473, "rewards/rejected": -0.017213736007462687, "step": 44 }, { "epoch": 0.030853616729516628, "grad_norm": 0.18352803806391685, "kl": 0.0859375, "learning_rate": 2.2e-06, "logits/chosen": 140025225.84615386, "logits/rejected": 131155220.31746031, "logps/chosen": -255.01538461538462, "logps/rejected": -321.77777777777777, "loss": 0.4987, "rewards/chosen": -0.0032865084134615385, "rewards/margins": 0.022022736378205128, "rewards/rejected": -0.025309244791666668, "step": 45 }, { "epoch": 0.03153925265683922, "grad_norm": 0.1509150330604722, "kl": 0.0, "learning_rate": 2.25e-06, "logits/chosen": 81788928.0, "logits/rejected": 86245376.0, "logps/chosen": -190.5, "logps/rejected": -216.25, "loss": 0.4996, "rewards/chosen": -0.0052738189697265625, "rewards/margins": 0.0029048919677734375, "rewards/rejected": -0.0081787109375, "step": 46 }, { "epoch": 0.03222488858416181, "grad_norm": 0.19469254620928378, "kl": 0.06640625, "learning_rate": 2.3000000000000004e-06, "logits/chosen": 91384987.15151516, "logits/rejected": 129955773.93548387, "logps/chosen": -243.15151515151516, "logps/rejected": -336.51612903225805, "loss": 0.4973, "rewards/chosen": -0.00579833984375, "rewards/margins": 0.026365218623991937, "rewards/rejected": -0.03216355846774194, "step": 47 }, { "epoch": 0.0329105245114844, "grad_norm": 0.16117403457565546, "kl": 0.0322265625, "learning_rate": 2.35e-06, "logits/chosen": 127044860.28985508, "logits/rejected": 101054290.44067797, "logps/chosen": -265.39130434782606, "logps/rejected": -216.67796610169492, "loss": 0.4983, "rewards/chosen": 0.00011853204257246377, "rewards/margins": 0.014427655453589414, "rewards/rejected": -0.01430912341101695, "step": 48 }, { "epoch": 0.03359616043880699, "grad_norm": 0.181441953819611, "kl": 0.0, "learning_rate": 2.4000000000000003e-06, "logits/chosen": 143843999.47540984, "logits/rejected": 114498239.04477613, "logps/chosen": -296.39344262295083, "logps/rejected": -238.80597014925374, "loss": 0.4984, "rewards/chosen": -0.011910860655737704, "rewards/margins": 0.010980055855456325, "rewards/rejected": -0.02289091651119403, "step": 49 }, { "epoch": 0.03428179636612959, "grad_norm": 0.16980720501100205, "kl": 0.04296875, "learning_rate": 2.4500000000000003e-06, "logits/chosen": 139374659.147541, "logits/rejected": 94403140.7761194, "logps/chosen": -195.80327868852459, "logps/rejected": -288.23880597014926, "loss": 0.4998, "rewards/chosen": 0.006887967469262295, "rewards/margins": 0.00012490776776975798, "rewards/rejected": 0.006763059701492537, "step": 50 }, { "epoch": 0.03496743229345218, "grad_norm": 0.20687520767493486, "kl": 0.0625, "learning_rate": 2.5e-06, "logits/chosen": 131338056.59701492, "logits/rejected": 163234060.59016395, "logps/chosen": -262.92537313432837, "logps/rejected": -380.8524590163934, "loss": 0.4948, "rewards/chosen": -0.006906993353544776, "rewards/margins": 0.04534710500711096, "rewards/rejected": -0.05225409836065574, "step": 51 }, { "epoch": 0.03565306822077477, "grad_norm": 0.17990732640008647, "kl": 0.001953125, "learning_rate": 2.55e-06, "logits/chosen": 109651090.28571428, "logits/rejected": 111872211.86206897, "logps/chosen": -232.68571428571428, "logps/rejected": -292.13793103448273, "loss": 0.4939, "rewards/chosen": 0.014481026785714286, "rewards/margins": 0.05201975254002463, "rewards/rejected": -0.037538725754310345, "step": 52 }, { "epoch": 0.03633870414809736, "grad_norm": 0.20563131670424833, "kl": 0.005859375, "learning_rate": 2.6e-06, "logits/chosen": 140757998.6440678, "logits/rejected": 99812277.79710145, "logps/chosen": -225.4915254237288, "logps/rejected": -315.1304347826087, "loss": 0.4972, "rewards/chosen": 0.002491061970338983, "rewards/margins": 0.020854682894252026, "rewards/rejected": -0.018363620923913044, "step": 53 }, { "epoch": 0.03702434007541995, "grad_norm": 0.23067752643599987, "kl": 0.0, "learning_rate": 2.6500000000000005e-06, "logits/chosen": 122936496.55172414, "logits/rejected": 123012944.45714286, "logps/chosen": -283.86206896551727, "logps/rejected": -344.6857142857143, "loss": 0.4965, "rewards/chosen": -0.005076441271551724, "rewards/margins": 0.02535045604987685, "rewards/rejected": -0.030426897321428573, "step": 54 }, { "epoch": 0.03770997600274254, "grad_norm": 0.2060267702134634, "kl": 0.05078125, "learning_rate": 2.7000000000000004e-06, "logits/chosen": 136814201.9047619, "logits/rejected": 155899053.2923077, "logps/chosen": -286.22222222222223, "logps/rejected": -347.81538461538463, "loss": 0.498, "rewards/chosen": -0.004681299603174603, "rewards/margins": 0.010147426358363857, "rewards/rejected": -0.014828725961538461, "step": 55 }, { "epoch": 0.03839561193006513, "grad_norm": 0.23925138694765485, "kl": 0.048828125, "learning_rate": 2.7500000000000004e-06, "logits/chosen": 131180473.37931034, "logits/rejected": 150635432.22857141, "logps/chosen": -227.0344827586207, "logps/rejected": -377.37142857142857, "loss": 0.4938, "rewards/chosen": 0.004491345635775862, "rewards/margins": 0.04974804206434729, "rewards/rejected": -0.04525669642857143, "step": 56 }, { "epoch": 0.03908124785738773, "grad_norm": 0.17611213230670847, "kl": 0.0, "learning_rate": 2.8000000000000003e-06, "logits/chosen": 140625692.44444445, "logits/rejected": 114144987.42857143, "logps/chosen": -252.66666666666666, "logps/rejected": -299.14285714285717, "loss": 0.4972, "rewards/chosen": -0.0027940538194444445, "rewards/margins": 0.03333875868055555, "rewards/rejected": -0.0361328125, "step": 57 }, { "epoch": 0.03976688378471032, "grad_norm": 0.1966209639688623, "kl": 0.056640625, "learning_rate": 2.85e-06, "logits/chosen": 105346935.46666667, "logits/rejected": 91596197.64705883, "logps/chosen": -187.06666666666666, "logps/rejected": -275.05882352941177, "loss": 0.495, "rewards/chosen": 0.0035481770833333333, "rewards/margins": 0.03764169730392157, "rewards/rejected": -0.03409352022058824, "step": 58 }, { "epoch": 0.04045251971203291, "grad_norm": 0.24110358941906626, "kl": 0.06640625, "learning_rate": 2.9e-06, "logits/chosen": 208205250.56, "logits/rejected": 53490819.28205128, "logps/chosen": -302.4, "logps/rejected": -245.12820512820514, "loss": 0.4943, "rewards/chosen": 0.007030029296875, "rewards/margins": 0.03592626327123398, "rewards/rejected": -0.028896233974358976, "step": 59 }, { "epoch": 0.0411381556393555, "grad_norm": 0.22265918972031376, "kl": 0.0, "learning_rate": 2.95e-06, "logits/chosen": 149815296.0, "logits/rejected": 115081216.0, "logps/chosen": -318.5, "logps/rejected": -269.75, "loss": 0.4928, "rewards/chosen": 0.033782958984375, "rewards/margins": 0.05792236328125, "rewards/rejected": -0.024139404296875, "step": 60 }, { "epoch": 0.04182379156667809, "grad_norm": 0.3218654117238397, "kl": 0.0, "learning_rate": 3e-06, "logits/chosen": 56156160.0, "logits/rejected": 165966279.1111111, "logps/chosen": -317.42857142857144, "logps/rejected": -318.8888888888889, "loss": 0.4926, "rewards/chosen": 0.018423897879464284, "rewards/margins": 0.056150406125992064, "rewards/rejected": -0.037726508246527776, "step": 61 }, { "epoch": 0.04250942749400068, "grad_norm": 0.20124027569305006, "kl": 0.0, "learning_rate": 3.05e-06, "logits/chosen": 76820935.1111111, "logits/rejected": 169719515.42857143, "logps/chosen": -209.77777777777777, "logps/rejected": -340.0, "loss": 0.4899, "rewards/chosen": 0.016167534722222224, "rewards/margins": 0.09059554811507936, "rewards/rejected": -0.07442801339285714, "step": 62 }, { "epoch": 0.04319506342132328, "grad_norm": 0.21396941725615137, "kl": 0.013671875, "learning_rate": 3.1000000000000004e-06, "logits/chosen": 148869829.97333333, "logits/rejected": 93778306.41509435, "logps/chosen": -292.48, "logps/rejected": -283.1698113207547, "loss": 0.4919, "rewards/chosen": 0.035416666666666666, "rewards/margins": 0.06446479461477987, "rewards/rejected": -0.029048127948113206, "step": 63 }, { "epoch": 0.04388069934864587, "grad_norm": 0.2500334888245484, "kl": 0.0, "learning_rate": 3.1500000000000003e-06, "logits/chosen": 175148349.79310346, "logits/rejected": 107494019.65714286, "logps/chosen": -265.1034482758621, "logps/rejected": -288.45714285714286, "loss": 0.4899, "rewards/chosen": 0.02772679822198276, "rewards/margins": 0.08046117322198276, "rewards/rejected": -0.052734375, "step": 64 }, { "epoch": 0.04456633527596846, "grad_norm": 0.23156812007056732, "kl": 0.0166015625, "learning_rate": 3.2000000000000003e-06, "logits/chosen": 145510084.92307693, "logits/rejected": 69454362.94736843, "logps/chosen": -233.84615384615384, "logps/rejected": -230.73684210526315, "loss": 0.4919, "rewards/chosen": 0.013864370492788462, "rewards/margins": 0.06284668957173582, "rewards/rejected": -0.048982319078947366, "step": 65 }, { "epoch": 0.04525197120329105, "grad_norm": 0.2617942684037337, "kl": 0.0, "learning_rate": 3.2500000000000002e-06, "logits/chosen": 146149799.72413793, "logits/rejected": 98865737.14285715, "logps/chosen": -275.0344827586207, "logps/rejected": -340.1142857142857, "loss": 0.4847, "rewards/chosen": 0.04761584051724138, "rewards/margins": 0.1239551262315271, "rewards/rejected": -0.07633928571428572, "step": 66 }, { "epoch": 0.04593760713061364, "grad_norm": 0.22210225254574004, "kl": 0.0, "learning_rate": 3.3000000000000006e-06, "logits/chosen": 167020941.37313432, "logits/rejected": 88596077.1147541, "logps/chosen": -262.92537313432837, "logps/rejected": -309.5081967213115, "loss": 0.4889, "rewards/chosen": 0.01844901469216418, "rewards/margins": 0.09189932206921336, "rewards/rejected": -0.07345030737704918, "step": 67 }, { "epoch": 0.046623243057936234, "grad_norm": 0.21053321762119123, "kl": 0.00390625, "learning_rate": 3.3500000000000005e-06, "logits/chosen": 168448660.6451613, "logits/rejected": 114644309.33333333, "logps/chosen": -285.4193548387097, "logps/rejected": -278.3030303030303, "loss": 0.4929, "rewards/chosen": 0.008820564516129033, "rewards/margins": 0.05838850959188661, "rewards/rejected": -0.049567945075757576, "step": 68 }, { "epoch": 0.047308878985258825, "grad_norm": 0.23448675182386597, "kl": 0.015625, "learning_rate": 3.4000000000000005e-06, "logits/chosen": 96017297.72307692, "logits/rejected": 142340030.98412699, "logps/chosen": -260.9230769230769, "logps/rejected": -354.7936507936508, "loss": 0.484, "rewards/chosen": 0.029507211538461538, "rewards/margins": 0.12846554487179487, "rewards/rejected": -0.09895833333333333, "step": 69 }, { "epoch": 0.04799451491258142, "grad_norm": 0.24995108901621435, "kl": 0.0, "learning_rate": 3.45e-06, "logits/chosen": 130986051.14754099, "logits/rejected": 104043779.82089552, "logps/chosen": -225.5737704918033, "logps/rejected": -298.5074626865672, "loss": 0.4829, "rewards/chosen": 0.016047243212090164, "rewards/margins": 0.1327683253016424, "rewards/rejected": -0.11672108208955224, "step": 70 }, { "epoch": 0.04868015083990401, "grad_norm": 0.22686552992123651, "kl": 0.0, "learning_rate": 3.5e-06, "logits/chosen": 168546493.04615384, "logits/rejected": 88213536.50793651, "logps/chosen": -259.2, "logps/rejected": -282.92063492063494, "loss": 0.4863, "rewards/chosen": 0.02541316105769231, "rewards/margins": 0.11237372653388279, "rewards/rejected": -0.08696056547619048, "step": 71 }, { "epoch": 0.049365786767226603, "grad_norm": 0.2071468591296618, "kl": 0.08984375, "learning_rate": 3.5500000000000003e-06, "logits/chosen": 185073664.0, "logits/rejected": 78970880.0, "logps/chosen": -288.5, "logps/rejected": -294.75, "loss": 0.4879, "rewards/chosen": 0.02375030517578125, "rewards/margins": 0.07610321044921875, "rewards/rejected": -0.0523529052734375, "step": 72 }, { "epoch": 0.050051422694549194, "grad_norm": 0.21475755161504995, "kl": 0.0, "learning_rate": 3.6000000000000003e-06, "logits/chosen": 165449160.86153847, "logits/rejected": 28694365.46031746, "logps/chosen": -279.87692307692305, "logps/rejected": -266.92063492063494, "loss": 0.4882, "rewards/chosen": 0.04033578725961538, "rewards/margins": 0.0987125233707265, "rewards/rejected": -0.05837673611111111, "step": 73 }, { "epoch": 0.050737058621871785, "grad_norm": 0.27680096290959144, "kl": 0.0, "learning_rate": 3.65e-06, "logits/chosen": 135437889.16363636, "logits/rejected": 119623848.32876712, "logps/chosen": -197.23636363636365, "logps/rejected": -275.5068493150685, "loss": 0.4802, "rewards/chosen": 0.03770862926136364, "rewards/margins": 0.14505023885040474, "rewards/rejected": -0.1073416095890411, "step": 74 }, { "epoch": 0.051422694549194375, "grad_norm": 0.18889698865296184, "kl": 0.0, "learning_rate": 3.7e-06, "logits/chosen": 174094009.50724638, "logits/rejected": 81326843.66101696, "logps/chosen": -278.2608695652174, "logps/rejected": -268.20338983050846, "loss": 0.4829, "rewards/chosen": 0.05310235507246377, "rewards/margins": 0.10517465380127733, "rewards/rejected": -0.05207229872881356, "step": 75 }, { "epoch": 0.052108330476516966, "grad_norm": 0.23762658828903402, "kl": 0.0, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 150850312.8275862, "logits/rejected": 66869189.48571429, "logps/chosen": -246.89655172413794, "logps/rejected": -297.6, "loss": 0.4796, "rewards/chosen": 0.05802128232758621, "rewards/margins": 0.15947217518472906, "rewards/rejected": -0.10145089285714286, "step": 76 }, { "epoch": 0.052793966403839564, "grad_norm": 0.24441596801940446, "kl": 0.0, "learning_rate": 3.8000000000000005e-06, "logits/chosen": 61820393.73913044, "logits/rejected": 143175055.18644068, "logps/chosen": -269.6811594202899, "logps/rejected": -290.7118644067797, "loss": 0.4781, "rewards/chosen": 0.05581974637681159, "rewards/margins": 0.18876466163104888, "rewards/rejected": -0.1329449152542373, "step": 77 }, { "epoch": 0.053479602331162154, "grad_norm": 0.2747700127505778, "kl": 0.01953125, "learning_rate": 3.85e-06, "logits/chosen": 130277624.24242425, "logits/rejected": 108848953.80645162, "logps/chosen": -254.54545454545453, "logps/rejected": -271.741935483871, "loss": 0.4774, "rewards/chosen": 0.03688003077651515, "rewards/margins": 0.18846773238941839, "rewards/rejected": -0.15158770161290322, "step": 78 }, { "epoch": 0.054165238258484745, "grad_norm": 0.3567639124205647, "kl": 0.0, "learning_rate": 3.900000000000001e-06, "logits/chosen": 134534279.245283, "logits/rejected": 158209146.88, "logps/chosen": -236.37735849056602, "logps/rejected": -378.4533333333333, "loss": 0.4677, "rewards/chosen": 0.047759433962264154, "rewards/margins": 0.23817610062893083, "rewards/rejected": -0.19041666666666668, "step": 79 }, { "epoch": 0.054850874185807336, "grad_norm": 0.2230656467732242, "kl": 0.029296875, "learning_rate": 3.95e-06, "logits/chosen": 162179754.66666666, "logits/rejected": 79285875.61290322, "logps/chosen": -338.90909090909093, "logps/rejected": -252.6451612903226, "loss": 0.4756, "rewards/chosen": 0.06816702178030302, "rewards/margins": 0.1656972637157869, "rewards/rejected": -0.09753024193548387, "step": 80 }, { "epoch": 0.055536510113129926, "grad_norm": 0.3166838037416667, "kl": 0.09375, "learning_rate": 4.000000000000001e-06, "logits/chosen": 122581916.90322581, "logits/rejected": 157286400.0, "logps/chosen": -248.51612903225808, "logps/rejected": -338.90909090909093, "loss": 0.4631, "rewards/chosen": 0.10105846774193548, "rewards/margins": 0.24002627077223854, "rewards/rejected": -0.13896780303030304, "step": 81 }, { "epoch": 0.05622214604045252, "grad_norm": 0.29066700150929814, "kl": 0.234375, "learning_rate": 4.05e-06, "logits/chosen": 134599028.36363637, "logits/rejected": 147274190.4516129, "logps/chosen": -227.87878787878788, "logps/rejected": -317.4193548387097, "loss": 0.466, "rewards/chosen": 0.08197206439393939, "rewards/margins": 0.30021803213587483, "rewards/rejected": -0.21824596774193547, "step": 82 }, { "epoch": 0.056907781967775115, "grad_norm": 0.2903220084929606, "kl": 0.1796875, "learning_rate": 4.1e-06, "logits/chosen": 115952210.58064516, "logits/rejected": 134980328.72727272, "logps/chosen": -296.7741935483871, "logps/rejected": -314.1818181818182, "loss": 0.4677, "rewards/chosen": 0.03225806451612903, "rewards/margins": -5147554.876832845, "rewards/rejected": 5147554.909090909, "step": 83 }, { "epoch": 0.057593417895097705, "grad_norm": 0.23416322644465812, "kl": 0.0673828125, "learning_rate": 4.15e-06, "logits/chosen": 188011341.2063492, "logits/rejected": 37635812.43076923, "logps/chosen": -224.76190476190476, "logps/rejected": -268.55384615384617, "loss": 0.4706, "rewards/chosen": 0.07955109126984126, "rewards/margins": 0.23916647588522588, "rewards/rejected": -0.1596153846153846, "step": 84 }, { "epoch": 0.058279053822420296, "grad_norm": 0.3224151587837206, "kl": 0.005859375, "learning_rate": 4.2000000000000004e-06, "logits/chosen": 96321823.43859649, "logits/rejected": 134099578.59154929, "logps/chosen": -222.17543859649123, "logps/rejected": -354.2535211267606, "loss": 0.4559, "rewards/chosen": 0.04296875, "rewards/margins": 0.3259793133802817, "rewards/rejected": -0.2830105633802817, "step": 85 }, { "epoch": 0.058964689749742887, "grad_norm": 0.2922828961139356, "kl": 0.0, "learning_rate": 4.25e-06, "logits/chosen": 133255100.85245901, "logits/rejected": 97079357.13432837, "logps/chosen": -229.24590163934425, "logps/rejected": -269.85074626865674, "loss": 0.455, "rewards/chosen": 0.11745965676229508, "rewards/margins": 0.3772544328816981, "rewards/rejected": -0.259794776119403, "step": 86 }, { "epoch": 0.05965032567706548, "grad_norm": 0.28370437707842827, "kl": 0.087890625, "learning_rate": 4.3e-06, "logits/chosen": 140704918.18666667, "logits/rejected": 144030816.6037736, "logps/chosen": -352.0, "logps/rejected": -280.45283018867923, "loss": 0.4584, "rewards/chosen": 0.11515625, "rewards/margins": 0.3831397405660378, "rewards/rejected": -0.26798349056603776, "step": 87 }, { "epoch": 0.06033596160438807, "grad_norm": 0.2763817067750465, "kl": 0.0, "learning_rate": 4.350000000000001e-06, "logits/chosen": 123565527.36507936, "logits/rejected": 110213403.56923077, "logps/chosen": -316.1904761904762, "logps/rejected": -305.2307692307692, "loss": 0.4476, "rewards/chosen": 0.12983630952380953, "rewards/margins": 0.437047847985348, "rewards/rejected": -0.3072115384615385, "step": 88 }, { "epoch": 0.06102159753171066, "grad_norm": 0.2747330249161708, "kl": 0.0, "learning_rate": 4.4e-06, "logits/chosen": 163755080.11267605, "logits/rejected": 63521630.315789476, "logps/chosen": -238.19718309859155, "logps/rejected": -283.2280701754386, "loss": 0.4495, "rewards/chosen": 0.14579665492957747, "rewards/margins": 0.4492506022979985, "rewards/rejected": -0.3034539473684211, "step": 89 }, { "epoch": 0.061707233459033256, "grad_norm": 0.2709849360000464, "kl": 0.0, "learning_rate": 4.450000000000001e-06, "logits/chosen": 101974016.0, "logits/rejected": 98041856.0, "logps/chosen": -242.0, "logps/rejected": -291.5, "loss": 0.4556, "rewards/chosen": 0.04180908203125, "rewards/margins": 0.36895751953125, "rewards/rejected": -0.3271484375, "step": 90 }, { "epoch": 0.06239286938635585, "grad_norm": 0.2624053859853983, "kl": 0.0, "learning_rate": 4.5e-06, "logits/chosen": 200343552.0, "logits/rejected": 60620800.0, "logps/chosen": -252.25, "logps/rejected": -256.5, "loss": 0.4453, "rewards/chosen": 0.1514892578125, "rewards/margins": 0.4044189453125, "rewards/rejected": -0.2529296875, "step": 91 }, { "epoch": 0.06307850531367844, "grad_norm": 0.2906535348150996, "kl": 0.0, "learning_rate": 4.5500000000000005e-06, "logits/chosen": 183750460.95238096, "logits/rejected": 119440872.36923076, "logps/chosen": -321.77777777777777, "logps/rejected": -357.16923076923075, "loss": 0.4396, "rewards/chosen": 0.08076016865079365, "rewards/margins": 0.485567860958486, "rewards/rejected": -0.4048076923076923, "step": 92 }, { "epoch": 0.06376414124100103, "grad_norm": 0.3459467357232646, "kl": 0.0, "learning_rate": 4.600000000000001e-06, "logits/chosen": 121421546.30508475, "logits/rejected": 170933084.7536232, "logps/chosen": -237.28813559322035, "logps/rejected": -379.82608695652175, "loss": 0.4247, "rewards/chosen": 0.0855568061440678, "rewards/margins": 0.61408941483972, "rewards/rejected": -0.5285326086956522, "step": 93 }, { "epoch": 0.06444977716832362, "grad_norm": 0.3185761942830034, "kl": 0.0, "learning_rate": 4.65e-06, "logits/chosen": 131954135.36507936, "logits/rejected": 121699343.75384615, "logps/chosen": -210.28571428571428, "logps/rejected": -270.7692307692308, "loss": 0.4268, "rewards/chosen": 0.08717757936507936, "rewards/margins": 0.39294681013431015, "rewards/rejected": -0.3057692307692308, "step": 94 }, { "epoch": 0.06513541309564622, "grad_norm": 0.22559584118965412, "kl": 0.0, "learning_rate": 4.7e-06, "logits/chosen": 149761325.17647058, "logits/rejected": 44802594.13333333, "logps/chosen": -210.35294117647058, "logps/rejected": -236.26666666666668, "loss": 0.4413, "rewards/chosen": 0.0935202205882353, "rewards/margins": 0.5299785539215687, "rewards/rejected": -0.43645833333333334, "step": 95 }, { "epoch": 0.0658210490229688, "grad_norm": 0.3006165966502515, "kl": 0.0, "learning_rate": 4.75e-06, "logits/chosen": 175424925.19298247, "logits/rejected": 81848002.70422535, "logps/chosen": -306.2456140350877, "logps/rejected": -275.38028169014086, "loss": 0.4197, "rewards/chosen": 0.15090460526315788, "rewards/margins": 0.6733517883617494, "rewards/rejected": -0.5224471830985915, "step": 96 }, { "epoch": 0.0665066849502914, "grad_norm": 0.28211971946876335, "kl": 0.0, "learning_rate": 4.800000000000001e-06, "logits/chosen": 125415597.07042253, "logits/rejected": 141649740.3508772, "logps/chosen": -307.15492957746477, "logps/rejected": -342.4561403508772, "loss": 0.4017, "rewards/chosen": 0.23771731954225353, "rewards/margins": 0.966335740594885, "rewards/rejected": -0.7286184210526315, "step": 97 }, { "epoch": 0.06719232087761398, "grad_norm": 0.24705553803693386, "kl": 0.0, "learning_rate": 4.85e-06, "logits/chosen": 100790396.12121212, "logits/rejected": 114599209.29032259, "logps/chosen": -210.42424242424244, "logps/rejected": -251.09677419354838, "loss": 0.4383, "rewards/chosen": 0.07703006628787878, "rewards/margins": 0.5483002275782014, "rewards/rejected": -0.47127016129032256, "step": 98 }, { "epoch": 0.06787795680493658, "grad_norm": 0.28721865391417556, "kl": 0.0, "learning_rate": 4.9000000000000005e-06, "logits/chosen": 128304446.95081967, "logits/rejected": 158882739.58208954, "logps/chosen": -239.47540983606558, "logps/rejected": -346.2686567164179, "loss": 0.403, "rewards/chosen": 0.10905481557377049, "rewards/margins": 0.6351742185588451, "rewards/rejected": -0.5261194029850746, "step": 99 }, { "epoch": 0.06856359273225918, "grad_norm": 0.26692936241843873, "kl": 0.0, "learning_rate": 4.95e-06, "logits/chosen": 120695792.71641791, "logits/rejected": 108639349.50819673, "logps/chosen": -224.47761194029852, "logps/rejected": -263.60655737704917, "loss": 0.4093, "rewards/chosen": 0.12628264925373134, "rewards/margins": 0.8414465836799608, "rewards/rejected": -0.7151639344262295, "step": 100 }, { "epoch": 0.06924922865958176, "grad_norm": 0.2537646231747368, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170640323.7647059, "logits/rejected": 105486745.6, "logps/chosen": -232.0, "logps/rejected": -268.26666666666665, "loss": 0.4181, "rewards/chosen": 0.07795266544117647, "rewards/margins": 0.6904526654411766, "rewards/rejected": -0.6125, "step": 101 }, { "epoch": 0.06993486458690436, "grad_norm": 0.286647329787637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121183121.72307692, "logits/rejected": 96335839.49206349, "logps/chosen": -293.16923076923075, "logps/rejected": -281.6507936507937, "loss": 0.3955, "rewards/chosen": 0.18892728365384615, "rewards/margins": 0.7901177598443223, "rewards/rejected": -0.6011904761904762, "step": 102 }, { "epoch": 0.07062050051422694, "grad_norm": 0.22659868199967884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131677515.71830986, "logits/rejected": 160413731.92982456, "logps/chosen": -218.81690140845072, "logps/rejected": -324.49122807017545, "loss": 0.3894, "rewards/chosen": 0.19803587147887325, "rewards/margins": 0.7769832398999259, "rewards/rejected": -0.5789473684210527, "step": 103 }, { "epoch": 0.07130613644154954, "grad_norm": 0.25401134227623573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117509271.08196722, "logits/rejected": 142856742.20895523, "logps/chosen": -223.21311475409837, "logps/rejected": -344.35820895522386, "loss": 0.3909, "rewards/chosen": 0.04620261270491803, "rewards/margins": 0.8540384336004404, "rewards/rejected": -0.8078358208955224, "step": 104 }, { "epoch": 0.07199177236887212, "grad_norm": 0.28502941835068013, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111770434.37037037, "logits/rejected": 156889641.5135135, "logps/chosen": -230.37037037037038, "logps/rejected": -359.35135135135135, "loss": 0.3753, "rewards/chosen": 0.00734230324074074, "rewards/margins": 1.0765990599974975, "rewards/rejected": -1.0692567567567568, "step": 105 }, { "epoch": 0.07267740829619472, "grad_norm": 0.22427409911347546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98598407.87692308, "logits/rejected": 116375291.93650794, "logps/chosen": -223.5076923076923, "logps/rejected": -317.968253968254, "loss": 0.3928, "rewards/chosen": 0.04128605769230769, "rewards/margins": 0.876603518009768, "rewards/rejected": -0.8353174603174603, "step": 106 }, { "epoch": 0.07336304422351732, "grad_norm": 0.22238614161713494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88180248.38095239, "logits/rejected": 63721156.92307692, "logps/chosen": -234.9206349206349, "logps/rejected": -231.87692307692308, "loss": 0.3945, "rewards/chosen": 0.09213789682539683, "rewards/margins": 1.0402148199023198, "rewards/rejected": -0.948076923076923, "step": 107 }, { "epoch": 0.0740486801508399, "grad_norm": 0.22954952830695444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107324837.64705883, "logits/rejected": 111848106.66666667, "logps/chosen": -243.05882352941177, "logps/rejected": -310.1333333333333, "loss": 0.3841, "rewards/chosen": 0.0964211856617647, "rewards/margins": 1.3224628523284314, "rewards/rejected": -1.2260416666666667, "step": 108 }, { "epoch": 0.0747343160781625, "grad_norm": 0.21828644327330474, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157533123.7647059, "logits/rejected": 80128682.66666667, "logps/chosen": -208.7058823529412, "logps/rejected": -305.06666666666666, "loss": 0.3799, "rewards/chosen": 0.14652745863970587, "rewards/margins": 1.3319441253063724, "rewards/rejected": -1.1854166666666666, "step": 109 }, { "epoch": 0.07541995200548508, "grad_norm": 0.2183319227796815, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160465953.03225806, "logits/rejected": 80835677.0909091, "logps/chosen": -263.48387096774195, "logps/rejected": -269.3333333333333, "loss": 0.3623, "rewards/chosen": 0.1305443548387097, "rewards/margins": 1.4866049608993157, "rewards/rejected": -1.356060606060606, "step": 110 }, { "epoch": 0.07610558793280768, "grad_norm": 0.22043472626555843, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131554677.84126984, "logits/rejected": 92016576.98461539, "logps/chosen": -210.53968253968253, "logps/rejected": -311.6307692307692, "loss": 0.3735, "rewards/chosen": 0.1039186507936508, "rewards/margins": 1.3616109584859584, "rewards/rejected": -1.2576923076923077, "step": 111 }, { "epoch": 0.07679122386013026, "grad_norm": 0.2316992216697822, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 196350299.11864406, "logits/rejected": 47292297.27536232, "logps/chosen": -288.0, "logps/rejected": -291.94202898550725, "loss": 0.3586, "rewards/chosen": 0.18862552966101695, "rewards/margins": 1.5436979934291328, "rewards/rejected": -1.355072463768116, "step": 112 }, { "epoch": 0.07747685978745286, "grad_norm": 0.1813002192029756, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68307236.57142857, "logits/rejected": 138412032.0, "logps/chosen": -216.45714285714286, "logps/rejected": -310.62068965517244, "loss": 0.3896, "rewards/chosen": -0.004241071428571428, "rewards/margins": 1.088431342364532, "rewards/rejected": -1.0926724137931034, "step": 113 }, { "epoch": 0.07816249571477546, "grad_norm": 0.202114028104285, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102726068.4590164, "logits/rejected": 156629083.70149255, "logps/chosen": -242.62295081967213, "logps/rejected": -332.8955223880597, "loss": 0.3302, "rewards/chosen": 0.19676133452868852, "rewards/margins": 2.0624329763197333, "rewards/rejected": -1.8656716417910448, "step": 114 }, { "epoch": 0.07884813164209804, "grad_norm": 0.1755392550077989, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100723214.62857144, "logits/rejected": 109413481.93103448, "logps/chosen": -251.65714285714284, "logps/rejected": -356.41379310344826, "loss": 0.3714, "rewards/chosen": 0.16212332589285713, "rewards/margins": 1.6610457396859606, "rewards/rejected": -1.4989224137931034, "step": 115 }, { "epoch": 0.07953376756942064, "grad_norm": 0.21156716356083197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77226702.59649123, "logits/rejected": 131736590.42253521, "logps/chosen": -191.57894736842104, "logps/rejected": -330.8169014084507, "loss": 0.3452, "rewards/chosen": 0.13294956140350878, "rewards/margins": 1.6910481529528045, "rewards/rejected": -1.5580985915492958, "step": 116 }, { "epoch": 0.08021940349674322, "grad_norm": 0.18048491779371242, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 65260526.644067794, "logits/rejected": 133974579.94202898, "logps/chosen": -268.47457627118644, "logps/rejected": -313.27536231884056, "loss": 0.3505, "rewards/chosen": 0.07759533898305085, "rewards/margins": 1.8275953389830508, "rewards/rejected": -1.75, "step": 117 }, { "epoch": 0.08090503942406582, "grad_norm": 0.20793682339073916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 180583852.21818182, "logits/rejected": 39300053.91780822, "logps/chosen": -286.54545454545456, "logps/rejected": -288.0, "loss": 0.3439, "rewards/chosen": 0.014275568181818183, "rewards/margins": 1.615302965442092, "rewards/rejected": -1.601027397260274, "step": 118 }, { "epoch": 0.0815906753513884, "grad_norm": 0.15674243322944367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67817760.45070423, "logits/rejected": 95126078.87719299, "logps/chosen": -251.26760563380282, "logps/rejected": -269.1929824561403, "loss": 0.3681, "rewards/chosen": 0.06547095070422536, "rewards/margins": 1.7781902489498393, "rewards/rejected": -1.712719298245614, "step": 119 }, { "epoch": 0.082276311278711, "grad_norm": 0.196838648835713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97933669.58730158, "logits/rejected": 113246208.0, "logps/chosen": -181.46031746031747, "logps/rejected": -371.2, "loss": 0.3109, "rewards/chosen": 0.27356150793650796, "rewards/margins": 1.8908692002442002, "rewards/rejected": -1.6173076923076923, "step": 120 }, { "epoch": 0.0829619472060336, "grad_norm": 0.15776224924811957, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94888529.6231884, "logits/rejected": 141042358.23728815, "logps/chosen": -250.20289855072463, "logps/rejected": -350.64406779661016, "loss": 0.3625, "rewards/chosen": -0.001585144927536232, "rewards/margins": 1.929558922869074, "rewards/rejected": -1.93114406779661, "step": 121 }, { "epoch": 0.08364758313335618, "grad_norm": 0.1550601106103246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123338752.0, "logits/rejected": 99549184.0, "logps/chosen": -239.25, "logps/rejected": -291.0, "loss": 0.3402, "rewards/chosen": 0.12530517578125, "rewards/margins": 2.16046142578125, "rewards/rejected": -2.03515625, "step": 122 }, { "epoch": 0.08433321906067878, "grad_norm": 0.16420295147600614, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75428712.91803278, "logits/rejected": 134718540.41791046, "logps/chosen": -216.91803278688525, "logps/rejected": -272.23880597014926, "loss": 0.3249, "rewards/chosen": 0.06634221311475409, "rewards/margins": 2.0831332578908732, "rewards/rejected": -2.0167910447761193, "step": 123 }, { "epoch": 0.08501885498800137, "grad_norm": 0.15605366784636743, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117593032.14545454, "logits/rejected": 162601100.2739726, "logps/chosen": -256.8727272727273, "logps/rejected": -358.13698630136986, "loss": 0.309, "rewards/chosen": 0.03146306818181818, "rewards/margins": 2.2900247120174346, "rewards/rejected": -2.2585616438356166, "step": 124 }, { "epoch": 0.08570449091532396, "grad_norm": 0.1528532606337219, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110704205.57575758, "logits/rejected": 154648047.48387095, "logps/chosen": -245.0909090909091, "logps/rejected": -331.8709677419355, "loss": 0.3328, "rewards/chosen": 0.24091500946969696, "rewards/margins": 2.714705332050342, "rewards/rejected": -2.473790322580645, "step": 125 }, { "epoch": 0.08639012684264656, "grad_norm": 0.17565568417717967, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103581072.69565217, "logits/rejected": 134075548.20338982, "logps/chosen": -243.2463768115942, "logps/rejected": -288.271186440678, "loss": 0.3399, "rewards/chosen": 0.1465126811594203, "rewards/margins": 2.3626143760746747, "rewards/rejected": -2.2161016949152543, "step": 126 }, { "epoch": 0.08707576276996914, "grad_norm": 0.19552399559774564, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 193497224.53333333, "logits/rejected": 101033381.64705883, "logps/chosen": -290.4, "logps/rejected": -315.7647058823529, "loss": 0.3235, "rewards/chosen": 0.12395833333333334, "rewards/margins": 2.2030024509803923, "rewards/rejected": -2.079044117647059, "step": 127 }, { "epoch": 0.08776139869729174, "grad_norm": 0.15309181575781405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63603901.62962963, "logits/rejected": 133877649.2972973, "logps/chosen": -191.40740740740742, "logps/rejected": -330.3783783783784, "loss": 0.3116, "rewards/chosen": -0.017867476851851853, "rewards/margins": 2.309835225850851, "rewards/rejected": -2.3277027027027026, "step": 128 }, { "epoch": 0.08844703462461433, "grad_norm": 0.15489356124679152, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111581801.65079366, "logits/rejected": 140541447.87692308, "logps/chosen": -241.77777777777777, "logps/rejected": -350.03076923076924, "loss": 0.3067, "rewards/chosen": 0.14598834325396826, "rewards/margins": 2.172911420177045, "rewards/rejected": -2.026923076923077, "step": 129 }, { "epoch": 0.08913267055193692, "grad_norm": 0.17347412595363307, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118279372.8, "logits/rejected": 102822128.94117647, "logps/chosen": -225.86666666666667, "logps/rejected": -301.1764705882353, "loss": 0.2921, "rewards/chosen": 0.459375, "rewards/margins": 2.786580882352941, "rewards/rejected": -2.327205882352941, "step": 130 }, { "epoch": 0.08981830647925951, "grad_norm": 0.1793851697528189, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83960978.28571428, "logits/rejected": 154956231.1111111, "logps/chosen": -279.42857142857144, "logps/rejected": -384.8888888888889, "loss": 0.2831, "rewards/chosen": 0.5853794642857143, "rewards/margins": 2.4464905753968256, "rewards/rejected": -1.8611111111111112, "step": 131 }, { "epoch": 0.0905039424065821, "grad_norm": 0.15079451316382866, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138691652.26666668, "logits/rejected": 69822825.41176471, "logps/chosen": -268.0, "logps/rejected": -313.6470588235294, "loss": 0.3089, "rewards/chosen": 0.4119140625, "rewards/margins": 2.2685317095588236, "rewards/rejected": -1.8566176470588236, "step": 132 }, { "epoch": 0.0911895783339047, "grad_norm": 0.13852883913927555, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90619041.68421052, "logits/rejected": 103617031.2112676, "logps/chosen": -245.33333333333334, "logps/rejected": -328.11267605633805, "loss": 0.312, "rewards/chosen": 0.12621641995614036, "rewards/margins": 2.770582617139239, "rewards/rejected": -2.6443661971830985, "step": 133 }, { "epoch": 0.09187521426122729, "grad_norm": 0.1582954154400507, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138599836.6567164, "logits/rejected": 119606423.08196722, "logps/chosen": -244.4179104477612, "logps/rejected": -316.59016393442624, "loss": 0.3047, "rewards/chosen": 0.3500466417910448, "rewards/margins": 3.038571231954979, "rewards/rejected": -2.6885245901639343, "step": 134 }, { "epoch": 0.09256085018854988, "grad_norm": 0.15427193688247406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114624336.45714286, "logits/rejected": 72966426.48275863, "logps/chosen": -263.54285714285714, "logps/rejected": -268.6896551724138, "loss": 0.3146, "rewards/chosen": 0.6151785714285715, "rewards/margins": 2.5117302955665024, "rewards/rejected": -1.896551724137931, "step": 135 }, { "epoch": 0.09324648611587247, "grad_norm": 0.14001737544849904, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127066783.47540984, "logits/rejected": 101007604.53731343, "logps/chosen": -224.2622950819672, "logps/rejected": -286.56716417910445, "loss": 0.3177, "rewards/chosen": 0.2041495901639344, "rewards/margins": 2.7395973513579643, "rewards/rejected": -2.53544776119403, "step": 136 }, { "epoch": 0.09393212204319507, "grad_norm": 0.17295156064789252, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128974848.0, "logits/rejected": 40951143.783783786, "logps/chosen": -240.59259259259258, "logps/rejected": -268.97297297297297, "loss": 0.2987, "rewards/chosen": 0.5391348379629629, "rewards/margins": 2.8668375406656654, "rewards/rejected": -2.3277027027027026, "step": 137 }, { "epoch": 0.09461775797051765, "grad_norm": 0.1580932907546033, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170748762.83870968, "logits/rejected": 131548625.45454545, "logps/chosen": -217.5483870967742, "logps/rejected": -302.06060606060606, "loss": 0.301, "rewards/chosen": 0.25784400201612906, "rewards/margins": 2.780571274743402, "rewards/rejected": -2.522727272727273, "step": 138 }, { "epoch": 0.09530339389784025, "grad_norm": 0.15604596829782152, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130949439.16883117, "logits/rejected": 123773088.62745099, "logps/chosen": -278.02597402597405, "logps/rejected": -344.7843137254902, "loss": 0.3331, "rewards/chosen": 0.3005275974025974, "rewards/margins": 3.1436648523045583, "rewards/rejected": -2.843137254901961, "step": 139 }, { "epoch": 0.09598902982516284, "grad_norm": 0.14743076761278903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109763437.71428572, "logits/rejected": 137013930.66666666, "logps/chosen": -222.71428571428572, "logps/rejected": -386.6666666666667, "loss": 0.2681, "rewards/chosen": 0.44308035714285715, "rewards/margins": 3.4534970238095237, "rewards/rejected": -3.0104166666666665, "step": 140 }, { "epoch": 0.09667466575248543, "grad_norm": 0.14278958530071315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113306995.01449275, "logits/rejected": 159525731.79661018, "logps/chosen": -256.69565217391306, "logps/rejected": -341.4237288135593, "loss": 0.3009, "rewards/chosen": 0.33163496376811596, "rewards/margins": 3.348584116310489, "rewards/rejected": -3.016949152542373, "step": 141 }, { "epoch": 0.09736030167980803, "grad_norm": 0.16147246884956762, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139810133.33333334, "logits/rejected": 104558006.85714285, "logps/chosen": -288.0, "logps/rejected": -318.85714285714283, "loss": 0.3569, "rewards/chosen": 0.1393771701388889, "rewards/margins": 2.514377170138889, "rewards/rejected": -2.375, "step": 142 }, { "epoch": 0.09804593760713061, "grad_norm": 0.15321223384493124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105497409.08474576, "logits/rejected": 130448933.10144928, "logps/chosen": -219.38983050847457, "logps/rejected": -330.2028985507246, "loss": 0.2653, "rewards/chosen": 0.3263373940677966, "rewards/margins": 3.7140185534880867, "rewards/rejected": -3.38768115942029, "step": 143 }, { "epoch": 0.09873157353445321, "grad_norm": 0.14438162123370515, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92495440.84210527, "logits/rejected": 150049748.73239437, "logps/chosen": -256.0, "logps/rejected": -377.6901408450704, "loss": 0.2307, "rewards/chosen": 0.41639254385964913, "rewards/margins": 4.416392543859649, "rewards/rejected": -4.0, "step": 144 }, { "epoch": 0.09941720946177579, "grad_norm": 0.1615725753129935, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99148686.22222222, "logits/rejected": 127776475.42857143, "logps/chosen": -214.44444444444446, "logps/rejected": -344.57142857142856, "loss": 0.2967, "rewards/chosen": 0.3564453125, "rewards/margins": 3.9412667410714284, "rewards/rejected": -3.5848214285714284, "step": 145 }, { "epoch": 0.10010284538909839, "grad_norm": 0.13959033135524448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120630546.02816902, "logits/rejected": 72379338.10526316, "logps/chosen": -219.94366197183098, "logps/rejected": -328.70175438596493, "loss": 0.3114, "rewards/chosen": 0.6529489436619719, "rewards/margins": 2.8196156103286385, "rewards/rejected": -2.1666666666666665, "step": 146 }, { "epoch": 0.10078848131642099, "grad_norm": 0.13128654623570427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132513792.0, "logits/rejected": 114819072.0, "logps/chosen": -235.0, "logps/rejected": -363.25, "loss": 0.2754, "rewards/chosen": 0.385498046875, "rewards/margins": 3.737060546875, "rewards/rejected": -3.3515625, "step": 147 }, { "epoch": 0.10147411724374357, "grad_norm": 0.1599624022136523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169336137.76271185, "logits/rejected": 106438062.3768116, "logps/chosen": -325.4237288135593, "logps/rejected": -384.92753623188406, "loss": 0.2978, "rewards/chosen": 0.2341101694915254, "rewards/margins": 3.44063190862196, "rewards/rejected": -3.2065217391304346, "step": 148 }, { "epoch": 0.10215975317106617, "grad_norm": 0.14254743923390023, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133073826.9090909, "logits/rejected": 99377944.77419356, "logps/chosen": -288.24242424242425, "logps/rejected": -306.83870967741933, "loss": 0.301, "rewards/chosen": 0.3566524621212121, "rewards/margins": 3.4897169782502444, "rewards/rejected": -3.1330645161290325, "step": 149 }, { "epoch": 0.10284538909838875, "grad_norm": 0.12946918512657346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149449674.10526314, "logits/rejected": 157256862.64788732, "logps/chosen": -312.140350877193, "logps/rejected": -379.0422535211268, "loss": 0.2646, "rewards/chosen": 0.24671052631578946, "rewards/margins": 3.306569681245367, "rewards/rejected": -3.0598591549295775, "step": 150 }, { "epoch": 0.10353102502571135, "grad_norm": 0.16770797814309352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110775590.57534246, "logits/rejected": 144932268.21818182, "logps/chosen": -306.8493150684931, "logps/rejected": -279.7090909090909, "loss": 0.314, "rewards/chosen": 0.5261130136986302, "rewards/margins": 3.632931195516812, "rewards/rejected": -3.106818181818182, "step": 151 }, { "epoch": 0.10421666095303393, "grad_norm": 0.14872220116567406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156128271.2835821, "logits/rejected": 90624470.03278689, "logps/chosen": -299.94029850746267, "logps/rejected": -368.78688524590166, "loss": 0.2911, "rewards/chosen": 0.7122201492537313, "rewards/margins": 3.7818922804012725, "rewards/rejected": -3.069672131147541, "step": 152 }, { "epoch": 0.10490229688035653, "grad_norm": 0.1564685353704582, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97961879.86440678, "logits/rejected": 73825829.10144928, "logps/chosen": -257.6271186440678, "logps/rejected": -287.07246376811594, "loss": 0.2982, "rewards/chosen": 0.3736758474576271, "rewards/margins": 3.377299035863424, "rewards/rejected": -3.003623188405797, "step": 153 }, { "epoch": 0.10558793280767913, "grad_norm": 0.1470022542160791, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136183808.0, "logits/rejected": 94765056.0, "logps/chosen": -250.0, "logps/rejected": -335.0, "loss": 0.2826, "rewards/chosen": 0.5208740234375, "rewards/margins": 3.8919677734375, "rewards/rejected": -3.37109375, "step": 154 }, { "epoch": 0.10627356873500171, "grad_norm": 0.1557871602466489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 187695104.0, "logits/rejected": 50200576.0, "logps/chosen": -292.75, "logps/rejected": -337.0, "loss": 0.2794, "rewards/chosen": 0.351806640625, "rewards/margins": 4.152587890625, "rewards/rejected": -3.80078125, "step": 155 }, { "epoch": 0.10695920466232431, "grad_norm": 0.17183527837025378, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119268798.35897435, "logits/rejected": 138244259.84, "logps/chosen": -262.97435897435895, "logps/rejected": -365.44, "loss": 0.3102, "rewards/chosen": 0.42788461538461536, "rewards/margins": 4.547884615384615, "rewards/rejected": -4.12, "step": 156 }, { "epoch": 0.10764484058964689, "grad_norm": 0.21133617227436438, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117670336.87671232, "logits/rejected": 138030731.63636363, "logps/chosen": -236.4931506849315, "logps/rejected": -337.74545454545455, "loss": 0.2904, "rewards/chosen": 0.7534246575342466, "rewards/margins": 4.31706102117061, "rewards/rejected": -3.5636363636363635, "step": 157 }, { "epoch": 0.10833047651696949, "grad_norm": 0.13532589394941522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162369327.72881356, "logits/rejected": 66440206.84057971, "logps/chosen": -253.28813559322035, "logps/rejected": -266.8985507246377, "loss": 0.2522, "rewards/chosen": 0.4997351694915254, "rewards/margins": 3.6084308216654386, "rewards/rejected": -3.108695652173913, "step": 158 }, { "epoch": 0.10901611244429209, "grad_norm": 0.1557963594221059, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155371609.04347825, "logits/rejected": 72938235.66101696, "logps/chosen": -219.59420289855072, "logps/rejected": -337.08474576271186, "loss": 0.2659, "rewards/chosen": 0.45714447463768115, "rewards/margins": 4.2707037966715795, "rewards/rejected": -3.8135593220338984, "step": 159 }, { "epoch": 0.10970174837161467, "grad_norm": 0.14694615452853807, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137363456.0, "logits/rejected": 93023670.85714285, "logps/chosen": -223.11111111111111, "logps/rejected": -320.2857142857143, "loss": 0.2878, "rewards/chosen": 0.6720920138888888, "rewards/margins": 3.453342013888889, "rewards/rejected": -2.78125, "step": 160 }, { "epoch": 0.11038738429893727, "grad_norm": 0.16430520773556426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152992231.6190476, "logits/rejected": 155640942.2769231, "logps/chosen": -259.04761904761904, "logps/rejected": -364.55384615384617, "loss": 0.2666, "rewards/chosen": 0.5679563492063492, "rewards/margins": 3.0025717338217337, "rewards/rejected": -2.4346153846153844, "step": 161 }, { "epoch": 0.11107302022625985, "grad_norm": 0.13964934119423184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139210947.04761904, "logits/rejected": 81369497.6, "logps/chosen": -322.53968253968253, "logps/rejected": -350.2769230769231, "loss": 0.2599, "rewards/chosen": 0.8040674603174603, "rewards/margins": 3.515605921855922, "rewards/rejected": -2.7115384615384617, "step": 162 }, { "epoch": 0.11175865615358245, "grad_norm": 0.16476928315619513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143538403.55555555, "logits/rejected": 63719716.571428575, "logps/chosen": -241.33333333333334, "logps/rejected": -312.0, "loss": 0.2842, "rewards/chosen": 0.7634548611111112, "rewards/margins": 4.209883432539683, "rewards/rejected": -3.4464285714285716, "step": 163 }, { "epoch": 0.11244429208090503, "grad_norm": 0.1602645435148409, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141897439.54929578, "logits/rejected": 78183298.24561404, "logps/chosen": -211.83098591549296, "logps/rejected": -345.82456140350877, "loss": 0.2841, "rewards/chosen": 0.7130281690140845, "rewards/margins": -23510498.093989372, "rewards/rejected": 23510498.807017542, "step": 164 }, { "epoch": 0.11312992800822763, "grad_norm": 0.17292099720219975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159252480.0, "logits/rejected": 84541440.0, "logps/chosen": -214.75, "logps/rejected": -404.5, "loss": 0.2716, "rewards/chosen": 0.35577392578125, "rewards/margins": 4.18780517578125, "rewards/rejected": -3.83203125, "step": 165 }, { "epoch": 0.11381556393555023, "grad_norm": 0.1721078491194573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103331547.42857143, "logits/rejected": 130547712.0, "logps/chosen": -251.14285714285714, "logps/rejected": -344.8888888888889, "loss": 0.2606, "rewards/chosen": 0.18722098214285715, "rewards/margins": 4.051804315476191, "rewards/rejected": -3.8645833333333335, "step": 166 }, { "epoch": 0.11450119986287281, "grad_norm": 0.13935831019163947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112475196.23529412, "logits/rejected": 112127726.93333334, "logps/chosen": -265.88235294117646, "logps/rejected": -341.06666666666666, "loss": 0.2678, "rewards/chosen": 0.6245404411764706, "rewards/margins": 4.691207107843137, "rewards/rejected": -4.066666666666666, "step": 167 }, { "epoch": 0.11518683579019541, "grad_norm": 0.1745534730271643, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127101163.01639344, "logits/rejected": 85208537.79104477, "logps/chosen": -243.9344262295082, "logps/rejected": -360.1194029850746, "loss": 0.2541, "rewards/chosen": 0.5117827868852459, "rewards/margins": 4.299096219721067, "rewards/rejected": -3.787313432835821, "step": 168 }, { "epoch": 0.115872471717518, "grad_norm": 0.13675752116500475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110633362.8852459, "logits/rejected": 137973821.13432837, "logps/chosen": -265.44262295081967, "logps/rejected": -311.8805970149254, "loss": 0.2616, "rewards/chosen": 0.6550973360655737, "rewards/margins": 3.9349480823342304, "rewards/rejected": -3.279850746268657, "step": 169 }, { "epoch": 0.11655810764484059, "grad_norm": 0.2085204445827831, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110162160.94117647, "logits/rejected": 112686967.46666667, "logps/chosen": -282.5882352941176, "logps/rejected": -408.53333333333336, "loss": 0.2978, "rewards/chosen": 0.3400735294117647, "rewards/margins": -13614144.726593137, "rewards/rejected": 13614145.066666666, "step": 170 }, { "epoch": 0.11724374357216318, "grad_norm": 0.16415507472229557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167891997.25714287, "logits/rejected": 81861243.5862069, "logps/chosen": -310.1714285714286, "logps/rejected": -336.0, "loss": 0.2583, "rewards/chosen": 0.9522321428571429, "rewards/margins": 4.5082666256157635, "rewards/rejected": -3.5560344827586206, "step": 171 }, { "epoch": 0.11792937949948577, "grad_norm": 0.17203900847061196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153057716.45901638, "logits/rejected": 89269813.49253732, "logps/chosen": -269.37704918032784, "logps/rejected": -348.17910447761193, "loss": 0.256, "rewards/chosen": 0.5476434426229508, "rewards/margins": 3.3125688157572792, "rewards/rejected": -2.764925373134328, "step": 172 }, { "epoch": 0.11861501542680837, "grad_norm": 0.16020447061952445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153715573.6216216, "logits/rejected": 73477992.2962963, "logps/chosen": -253.83783783783784, "logps/rejected": -354.3703703703704, "loss": 0.2733, "rewards/chosen": 0.5717905405405406, "rewards/margins": 4.82642017017017, "rewards/rejected": -4.25462962962963, "step": 173 }, { "epoch": 0.11930065135413095, "grad_norm": 0.14992935206423222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173911648.46376812, "logits/rejected": 76066191.18644068, "logps/chosen": -304.463768115942, "logps/rejected": -318.3728813559322, "loss": 0.2543, "rewards/chosen": 0.9438405797101449, "rewards/margins": 5.00316261360845, "rewards/rejected": -4.059322033898305, "step": 174 }, { "epoch": 0.11998628728145355, "grad_norm": 0.15225141514708448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150501496.47058824, "logits/rejected": 109051904.0, "logps/chosen": -233.88235294117646, "logps/rejected": -322.93333333333334, "loss": 0.2738, "rewards/chosen": 0.4572610294117647, "rewards/margins": 4.661427696078431, "rewards/rejected": -4.204166666666667, "step": 175 }, { "epoch": 0.12067192320877614, "grad_norm": 0.17430397790336816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95158272.0, "logits/rejected": 98107392.0, "logps/chosen": -192.5, "logps/rejected": -318.0, "loss": 0.2373, "rewards/chosen": 0.634765625, "rewards/margins": 4.876953125, "rewards/rejected": -4.2421875, "step": 176 }, { "epoch": 0.12135755913609873, "grad_norm": 0.1559911436196411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119505400.12307692, "logits/rejected": 88812722.79365079, "logps/chosen": -253.2923076923077, "logps/rejected": -323.3015873015873, "loss": 0.275, "rewards/chosen": 0.42740384615384613, "rewards/margins": 4.24089590964591, "rewards/rejected": -3.8134920634920637, "step": 177 }, { "epoch": 0.12204319506342132, "grad_norm": 0.15356000977030246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157343079.7837838, "logits/rejected": 53788065.18518519, "logps/chosen": -246.9189189189189, "logps/rejected": -344.0, "loss": 0.2635, "rewards/chosen": 0.7174831081081081, "rewards/margins": 4.393409034034034, "rewards/rejected": -3.675925925925926, "step": 178 }, { "epoch": 0.12272883099074391, "grad_norm": 0.16458769572801266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79613524.05970149, "logits/rejected": 176298286.16393444, "logps/chosen": -228.29850746268656, "logps/rejected": -401.3114754098361, "loss": 0.2281, "rewards/chosen": 0.710820895522388, "rewards/margins": 4.9813126988010765, "rewards/rejected": -4.270491803278689, "step": 179 }, { "epoch": 0.12341446691806651, "grad_norm": 0.15931616151110928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146478001.23076922, "logits/rejected": 134659233.68421054, "logps/chosen": -268.3076923076923, "logps/rejected": -349.89473684210526, "loss": 0.2402, "rewards/chosen": 0.4191706730769231, "rewards/margins": 4.563907515182186, "rewards/rejected": -4.144736842105263, "step": 180 }, { "epoch": 0.1241001028453891, "grad_norm": 0.16861594145769412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128941022.96774194, "logits/rejected": 130658924.60606061, "logps/chosen": -221.16129032258064, "logps/rejected": -399.030303030303, "loss": 0.2341, "rewards/chosen": 0.5690524193548387, "rewards/margins": 4.65996151026393, "rewards/rejected": -4.090909090909091, "step": 181 }, { "epoch": 0.1247857387727117, "grad_norm": 0.1619275022985661, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114911593.41176471, "logits/rejected": 81089877.33333333, "logps/chosen": -245.64705882352942, "logps/rejected": -317.06666666666666, "loss": 0.2427, "rewards/chosen": 0.9126838235294118, "rewards/margins": 5.329350490196079, "rewards/rejected": -4.416666666666667, "step": 182 }, { "epoch": 0.12547137470003428, "grad_norm": 0.1430778131922557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145402538.66666666, "logits/rejected": 131613200.51612903, "logps/chosen": -336.72727272727275, "logps/rejected": -314.5806451612903, "loss": 0.2372, "rewards/chosen": 1.0776515151515151, "rewards/margins": 4.839748289345064, "rewards/rejected": -3.7620967741935485, "step": 183 }, { "epoch": 0.12615701062735687, "grad_norm": 0.20991713756319857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119573208.94915254, "logits/rejected": 110753940.4057971, "logps/chosen": -234.57627118644066, "logps/rejected": -298.6666666666667, "loss": 0.2376, "rewards/chosen": 0.7404661016949152, "rewards/margins": 5.0375675509702775, "rewards/rejected": -4.297101449275362, "step": 184 }, { "epoch": 0.12684264655467947, "grad_norm": 0.1997647128913248, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134119424.0, "logits/rejected": 92471296.0, "logps/chosen": -243.0, "logps/rejected": -367.5, "loss": 0.2771, "rewards/chosen": 0.237762451171875, "rewards/margins": 4.698699951171875, "rewards/rejected": -4.4609375, "step": 185 }, { "epoch": 0.12752828248200207, "grad_norm": 0.12893293653177662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143864627.2, "logits/rejected": 93492886.58823529, "logps/chosen": -298.6666666666667, "logps/rejected": -319.05882352941177, "loss": 0.1611, "rewards/chosen": 1.5895833333333333, "rewards/margins": 5.486642156862745, "rewards/rejected": -3.8970588235294117, "step": 186 }, { "epoch": 0.12821391840932464, "grad_norm": 0.17415855722417317, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111540050.44067797, "logits/rejected": 99325981.68115942, "logps/chosen": -252.74576271186442, "logps/rejected": -313.5072463768116, "loss": 0.2388, "rewards/chosen": 0.7441737288135594, "rewards/margins": 5.077507062146893, "rewards/rejected": -4.333333333333333, "step": 187 }, { "epoch": 0.12889955433664724, "grad_norm": 0.17614346037070042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96198391.74193548, "logits/rejected": 90304636.12121212, "logps/chosen": -286.19354838709677, "logps/rejected": -365.57575757575756, "loss": 0.2524, "rewards/chosen": 0.4596774193548387, "rewards/margins": 4.80058651026393, "rewards/rejected": -4.340909090909091, "step": 188 }, { "epoch": 0.12958519026396983, "grad_norm": 0.18800456459117776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130461634.86567163, "logits/rejected": 81823307.5409836, "logps/chosen": -277.13432835820896, "logps/rejected": -302.95081967213116, "loss": 0.2381, "rewards/chosen": 1.0429104477611941, "rewards/margins": 5.477336677269391, "rewards/rejected": -4.434426229508197, "step": 189 }, { "epoch": 0.13027082619129243, "grad_norm": 0.1827797686202193, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83716457.41176471, "logits/rejected": 122194056.53333333, "logps/chosen": -251.52941176470588, "logps/rejected": -356.53333333333336, "loss": 0.2484, "rewards/chosen": 1.1957720588235294, "rewards/margins": 4.229105392156862, "rewards/rejected": -3.033333333333333, "step": 190 }, { "epoch": 0.130956462118615, "grad_norm": 0.17808660397311313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 184283070.98412699, "logits/rejected": 35232153.6, "logps/chosen": -239.61904761904762, "logps/rejected": -293.66153846153844, "loss": 0.249, "rewards/chosen": 0.9608134920634921, "rewards/margins": 4.572351953601954, "rewards/rejected": -3.6115384615384616, "step": 191 }, { "epoch": 0.1316420980459376, "grad_norm": 0.17010429582831213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61002450.823529415, "logits/rejected": 205101465.6, "logps/chosen": -237.88235294117646, "logps/rejected": -531.2, "loss": 0.2212, "rewards/chosen": 0.8033088235294118, "rewards/margins": 5.719975490196079, "rewards/rejected": -4.916666666666667, "step": 192 }, { "epoch": 0.1323277339732602, "grad_norm": 0.21864840944816583, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97612893.0909091, "logits/rejected": 87674483.61290322, "logps/chosen": -191.63636363636363, "logps/rejected": -298.3225806451613, "loss": 0.25, "rewards/chosen": 0.7523674242424242, "rewards/margins": -641627.8927938661, "rewards/rejected": 641628.6451612903, "step": 193 }, { "epoch": 0.1330133699005828, "grad_norm": 0.22839514785067022, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75549253.5308642, "logits/rejected": 177856337.70212767, "logps/chosen": -197.53086419753086, "logps/rejected": -384.0, "loss": 0.2763, "rewards/chosen": 0.7430555555555556, "rewards/margins": 3.322842789598109, "rewards/rejected": -2.5797872340425534, "step": 194 }, { "epoch": 0.1336990058279054, "grad_norm": 0.20248252613554535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144493772.8, "logits/rejected": 158304135.52941176, "logps/chosen": -252.0, "logps/rejected": -406.11764705882354, "loss": 0.2415, "rewards/chosen": 0.720703125, "rewards/margins": 5.691291360294118, "rewards/rejected": -4.970588235294118, "step": 195 }, { "epoch": 0.13438464175522796, "grad_norm": 0.24650310051002233, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173801472.0, "logits/rejected": 55410688.0, "logps/chosen": -264.0, "logps/rejected": -317.0, "loss": 0.2473, "rewards/chosen": 0.77587890625, "rewards/margins": 4.25634765625, "rewards/rejected": -3.48046875, "step": 196 }, { "epoch": 0.13507027768255056, "grad_norm": 0.24871397646668147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108922848.4923077, "logits/rejected": 108386141.46031746, "logps/chosen": -199.87692307692308, "logps/rejected": -328.63492063492066, "loss": 0.2496, "rewards/chosen": 0.6605769230769231, "rewards/margins": 3.8391483516483516, "rewards/rejected": -3.1785714285714284, "step": 197 }, { "epoch": 0.13575591360987316, "grad_norm": 0.2584477246137684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120586240.0, "logits/rejected": 131989504.0, "logps/chosen": -285.25, "logps/rejected": -365.5, "loss": 0.2573, "rewards/chosen": 0.476806640625, "rewards/margins": 4.730712890625, "rewards/rejected": -4.25390625, "step": 198 }, { "epoch": 0.13644154953719576, "grad_norm": 0.24636185713537204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155488841.14285713, "logits/rejected": 79691776.0, "logps/chosen": -207.57142857142858, "logps/rejected": -337.3333333333333, "loss": 0.235, "rewards/chosen": 0.38992745535714285, "rewards/margins": 4.563538566468254, "rewards/rejected": -4.173611111111111, "step": 199 }, { "epoch": 0.13712718546451835, "grad_norm": 0.24201946992892345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96166859.9322034, "logits/rejected": 127288008.3478261, "logps/chosen": -240.0, "logps/rejected": -321.8550724637681, "loss": 0.2605, "rewards/chosen": 0.4592161016949153, "rewards/margins": 4.136752333578974, "rewards/rejected": -3.677536231884058, "step": 200 }, { "epoch": 0.13781282139184092, "grad_norm": 0.27384185566029406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171473016.47058824, "logits/rejected": 72841079.46666667, "logps/chosen": -237.1764705882353, "logps/rejected": -282.6666666666667, "loss": 0.2526, "rewards/chosen": 0.4889705882352941, "rewards/margins": 4.818137254901961, "rewards/rejected": -4.329166666666667, "step": 201 }, { "epoch": 0.13849845731916352, "grad_norm": 0.219082953422919, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155682695.52941176, "logits/rejected": 91226112.0, "logps/chosen": -281.1764705882353, "logps/rejected": -342.93333333333334, "loss": 0.2362, "rewards/chosen": 0.7169117647058824, "rewards/margins": 5.2710784313725485, "rewards/rejected": -4.554166666666666, "step": 202 }, { "epoch": 0.13918409324648612, "grad_norm": 0.1873846826438938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92134877.86666666, "logits/rejected": 163824579.7647059, "logps/chosen": -215.46666666666667, "logps/rejected": -416.0, "loss": 0.2242, "rewards/chosen": 0.53359375, "rewards/margins": 5.2688878676470585, "rewards/rejected": -4.735294117647059, "step": 203 }, { "epoch": 0.13986972917380872, "grad_norm": 0.23021371537452034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101958595.76470588, "logits/rejected": 152532855.46666667, "logps/chosen": -252.47058823529412, "logps/rejected": -326.4, "loss": 0.2301, "rewards/chosen": 1.1075367647058822, "rewards/margins": 5.4117034313725485, "rewards/rejected": -4.304166666666666, "step": 204 }, { "epoch": 0.14055536510113129, "grad_norm": 0.24003519687236868, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106868567.67123288, "logits/rejected": 109204424.14545454, "logps/chosen": -208.43835616438355, "logps/rejected": -355.4909090909091, "loss": 0.2306, "rewards/chosen": 1.1284246575342465, "rewards/margins": 3.5102428393524283, "rewards/rejected": -2.381818181818182, "step": 205 }, { "epoch": 0.14124100102845388, "grad_norm": 0.3526447015282055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121355195.73333333, "logits/rejected": 99768922.35294117, "logps/chosen": -288.26666666666665, "logps/rejected": -352.94117647058823, "loss": 0.2113, "rewards/chosen": 0.6015625, "rewards/margins": 5.270680147058823, "rewards/rejected": -4.669117647058823, "step": 206 }, { "epoch": 0.14192663695577648, "grad_norm": 0.25917716983552896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88174286.3283582, "logits/rejected": 131329846.55737706, "logps/chosen": -270.089552238806, "logps/rejected": -334.6885245901639, "loss": 0.2353, "rewards/chosen": 0.9846082089552238, "rewards/margins": 5.226411487643748, "rewards/rejected": -4.241803278688525, "step": 207 }, { "epoch": 0.14261227288309908, "grad_norm": 0.23789043570262422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153585543.52941176, "logits/rejected": 73295462.4, "logps/chosen": -296.47058823529414, "logps/rejected": -302.1333333333333, "loss": 0.2432, "rewards/chosen": 0.7123161764705882, "rewards/margins": 5.516482843137254, "rewards/rejected": -4.804166666666666, "step": 208 }, { "epoch": 0.14329790881042168, "grad_norm": 0.3674097705563513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157659865.42465752, "logits/rejected": 52810100.36363637, "logps/chosen": -271.3424657534247, "logps/rejected": -315.05454545454546, "loss": 0.2552, "rewards/chosen": 1.1866438356164384, "rewards/margins": 3.5207347447073474, "rewards/rejected": -2.334090909090909, "step": 209 }, { "epoch": 0.14398354473774425, "grad_norm": 0.2585452923155214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114560840.59701492, "logits/rejected": 97156582.81967214, "logps/chosen": -207.76119402985074, "logps/rejected": -317.6393442622951, "loss": 0.2293, "rewards/chosen": 0.7014925373134329, "rewards/margins": 5.2998531930511374, "rewards/rejected": -4.598360655737705, "step": 210 }, { "epoch": 0.14466918066506684, "grad_norm": 0.2529800432208966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120551287.46666667, "logits/rejected": 123115158.58823529, "logps/chosen": -261.6, "logps/rejected": -366.11764705882354, "loss": 0.2156, "rewards/chosen": 0.7869791666666667, "rewards/margins": 5.735508578431372, "rewards/rejected": -4.948529411764706, "step": 211 }, { "epoch": 0.14535481659238944, "grad_norm": 0.4338678291426242, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169193957.9661017, "logits/rejected": 75619046.0289855, "logps/chosen": -256.271186440678, "logps/rejected": -354.3188405797101, "loss": 0.2168, "rewards/chosen": 0.5932203389830508, "rewards/margins": 4.325104396954066, "rewards/rejected": -3.7318840579710146, "step": 212 }, { "epoch": 0.14604045251971204, "grad_norm": 0.29794375758339353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176033667.87878788, "logits/rejected": 111191337.29032259, "logps/chosen": -294.54545454545456, "logps/rejected": -388.64516129032256, "loss": 0.222, "rewards/chosen": 0.8143939393939394, "rewards/margins": 5.83455522971652, "rewards/rejected": -5.020161290322581, "step": 213 }, { "epoch": 0.14672608844703464, "grad_norm": 0.3446496158972345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154829736.22857141, "logits/rejected": 93937946.48275863, "logps/chosen": -254.62857142857143, "logps/rejected": -307.3103448275862, "loss": 0.2382, "rewards/chosen": 0.9017857142857143, "rewards/margins": 5.656096059113301, "rewards/rejected": -4.754310344827586, "step": 214 }, { "epoch": 0.1474117243743572, "grad_norm": 0.2988624716317413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109939811.09677419, "logits/rejected": 83123479.27272727, "logps/chosen": -259.61290322580646, "logps/rejected": -253.0909090909091, "loss": 0.2275, "rewards/chosen": 0.6902091733870968, "rewards/margins": 4.372027355205279, "rewards/rejected": -3.6818181818181817, "step": 215 }, { "epoch": 0.1480973603016798, "grad_norm": 0.2255457781399757, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160060052.6451613, "logits/rejected": 67426614.3030303, "logps/chosen": -274.5806451612903, "logps/rejected": -282.6666666666667, "loss": 0.2154, "rewards/chosen": 0.4770665322580645, "rewards/margins": 4.814187744379277, "rewards/rejected": -4.337121212121212, "step": 216 }, { "epoch": 0.1487829962290024, "grad_norm": 0.33384772908027965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150662062.73015872, "logits/rejected": 28755180.307692308, "logps/chosen": -228.06349206349208, "logps/rejected": -301.7846153846154, "loss": 0.2286, "rewards/chosen": 0.8611111111111112, "rewards/margins": -22295027.323504273, "rewards/rejected": 22295028.184615385, "step": 217 }, { "epoch": 0.149468632156325, "grad_norm": 0.4377711440094187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131847034.43478261, "logits/rejected": 162369327.72881356, "logps/chosen": -222.14492753623188, "logps/rejected": -418.7118644067797, "loss": 0.2308, "rewards/chosen": 0.7640398550724637, "rewards/margins": 4.4547178211741585, "rewards/rejected": -3.690677966101695, "step": 218 }, { "epoch": 0.1501542680836476, "grad_norm": 0.39891917114081527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143469869.17647058, "logits/rejected": 102900258.13333334, "logps/chosen": -215.76470588235293, "logps/rejected": -317.3333333333333, "loss": 0.255, "rewards/chosen": 0.5355009191176471, "rewards/margins": 5.0021675857843135, "rewards/rejected": -4.466666666666667, "step": 219 }, { "epoch": 0.15083990401097017, "grad_norm": 0.20163817459280645, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96468992.0, "logits/rejected": 106954752.0, "logps/chosen": -236.11428571428573, "logps/rejected": -317.51724137931035, "loss": 0.2354, "rewards/chosen": 0.9669642857142857, "rewards/margins": 5.316102216748769, "rewards/rejected": -4.349137931034483, "step": 220 }, { "epoch": 0.15152553993829276, "grad_norm": 0.20714190988604989, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77318682.94736843, "logits/rejected": 168901395.69230768, "logps/chosen": -174.94736842105263, "logps/rejected": -367.0769230769231, "loss": 0.2449, "rewards/chosen": 0.8355263157894737, "rewards/margins": 6.056680161943319, "rewards/rejected": -5.221153846153846, "step": 221 }, { "epoch": 0.15221117586561536, "grad_norm": 0.2743646745915024, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104857600.0, "logits/rejected": 100705577.29032259, "logps/chosen": -214.3030303030303, "logps/rejected": -320.7741935483871, "loss": 0.2224, "rewards/chosen": 0.9195075757575758, "rewards/margins": 5.068701124144673, "rewards/rejected": -4.149193548387097, "step": 222 }, { "epoch": 0.15289681179293796, "grad_norm": 0.37335271164596057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86313605.26027398, "logits/rejected": 175855727.70909092, "logps/chosen": -216.54794520547946, "logps/rejected": -402.3272727272727, "loss": 0.2532, "rewards/chosen": 0.5629280821917808, "rewards/margins": 5.981109900373599, "rewards/rejected": -5.418181818181818, "step": 223 }, { "epoch": 0.15358244772026053, "grad_norm": 0.28914445063880495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165508567.36507937, "logits/rejected": 62946823.87692308, "logps/chosen": -242.28571428571428, "logps/rejected": -295.87692307692305, "loss": 0.2537, "rewards/chosen": 0.48387896825396826, "rewards/margins": 4.499263583638584, "rewards/rejected": -4.015384615384615, "step": 224 }, { "epoch": 0.15426808364758313, "grad_norm": 0.2887329739972034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175398167.27272728, "logits/rejected": 111465065.20547946, "logps/chosen": -266.4727272727273, "logps/rejected": -327.8904109589041, "loss": 0.1932, "rewards/chosen": 0.6670454545454545, "rewards/margins": 5.804031755915317, "rewards/rejected": -5.136986301369863, "step": 225 }, { "epoch": 0.15495371957490572, "grad_norm": 0.41492304421860055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105168289.18518518, "logits/rejected": 152241899.24324325, "logps/chosen": -277.037037037037, "logps/rejected": -394.81081081081084, "loss": 0.1938, "rewards/chosen": 0.8301504629629629, "rewards/margins": 5.681501814314315, "rewards/rejected": -4.851351351351352, "step": 226 }, { "epoch": 0.15563935550222832, "grad_norm": 0.2253555896791388, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105507421.74647887, "logits/rejected": 107874555.50877193, "logps/chosen": -241.35211267605635, "logps/rejected": -320.0, "loss": 0.2307, "rewards/chosen": 0.7099471830985915, "rewards/margins": 5.648543674326662, "rewards/rejected": -4.93859649122807, "step": 227 }, { "epoch": 0.15632499142955092, "grad_norm": 0.19864633288387595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147732707.55555555, "logits/rejected": 90776722.28571428, "logps/chosen": -291.55555555555554, "logps/rejected": -352.57142857142856, "loss": 0.2392, "rewards/chosen": 1.1961805555555556, "rewards/margins": -23630769.08953373, "rewards/rejected": 23630770.285714287, "step": 228 }, { "epoch": 0.1570106273568735, "grad_norm": 0.20395986326417767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138556663.1724138, "logits/rejected": 80410799.54285714, "logps/chosen": -224.27586206896552, "logps/rejected": -314.9714285714286, "loss": 0.2162, "rewards/chosen": 0.6783405172413793, "rewards/margins": 4.042626231527094, "rewards/rejected": -3.3642857142857143, "step": 229 }, { "epoch": 0.1576962632841961, "grad_norm": 0.24204601978093462, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157659865.42465752, "logits/rejected": 86021362.03636363, "logps/chosen": -236.27397260273972, "logps/rejected": -320.2909090909091, "loss": 0.2368, "rewards/chosen": 0.872431506849315, "rewards/margins": 6.163340597758406, "rewards/rejected": -5.290909090909091, "step": 230 }, { "epoch": 0.15838189921151868, "grad_norm": 0.273474840087959, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87106706.28571428, "logits/rejected": 180471580.44444445, "logps/chosen": -193.85714285714286, "logps/rejected": -378.22222222222223, "loss": 0.2047, "rewards/chosen": 0.6216517857142857, "rewards/margins": 5.468874007936508, "rewards/rejected": -4.847222222222222, "step": 231 }, { "epoch": 0.15906753513884128, "grad_norm": 0.2690790911529179, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133056228.43076923, "logits/rejected": 143804708.57142857, "logps/chosen": -238.27692307692308, "logps/rejected": -380.95238095238096, "loss": 0.2562, "rewards/chosen": 0.2553410456730769, "rewards/margins": 4.104547394879426, "rewards/rejected": -3.8492063492063493, "step": 232 }, { "epoch": 0.15975317106616388, "grad_norm": 0.22818905581413249, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125899025.06666666, "logits/rejected": 114171422.11764705, "logps/chosen": -266.6666666666667, "logps/rejected": -335.52941176470586, "loss": 0.2222, "rewards/chosen": 0.721875, "rewards/margins": 5.696139705882353, "rewards/rejected": -4.974264705882353, "step": 233 }, { "epoch": 0.16043880699348645, "grad_norm": 0.22509844634221948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120820995.82089552, "logits/rejected": 146456844.59016395, "logps/chosen": -269.3731343283582, "logps/rejected": -397.6393442622951, "loss": 0.2288, "rewards/chosen": 0.8446828358208955, "rewards/margins": 5.709436934181552, "rewards/rejected": -4.864754098360656, "step": 234 }, { "epoch": 0.16112444292080905, "grad_norm": 0.24080771712090562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125203104.47761194, "logits/rejected": 117234234.75409836, "logps/chosen": -244.0597014925373, "logps/rejected": -354.0983606557377, "loss": 0.2339, "rewards/chosen": 0.519589552238806, "rewards/margins": 6.249097748960117, "rewards/rejected": -5.729508196721311, "step": 235 }, { "epoch": 0.16181007884813164, "grad_norm": 0.3866986184351914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113809621.97014925, "logits/rejected": 155945597.90163934, "logps/chosen": -217.55223880597015, "logps/rejected": -361.7049180327869, "loss": 0.2388, "rewards/chosen": 0.47388059701492535, "rewards/margins": 4.531257646195254, "rewards/rejected": -4.057377049180328, "step": 236 }, { "epoch": 0.16249571477545424, "grad_norm": 0.17981813494650173, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103592651.17460318, "logits/rejected": 126345342.03076923, "logps/chosen": -239.23809523809524, "logps/rejected": -347.32307692307694, "loss": 0.2223, "rewards/chosen": 0.6609623015873016, "rewards/margins": 5.518654609279609, "rewards/rejected": -4.857692307692307, "step": 237 }, { "epoch": 0.1631813507027768, "grad_norm": 0.1717283729892488, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120900812.8, "logits/rejected": 119105897.41176471, "logps/chosen": -217.6, "logps/rejected": -338.3529411764706, "loss": 0.1927, "rewards/chosen": 0.9729166666666667, "rewards/margins": 6.171446078431372, "rewards/rejected": -5.198529411764706, "step": 238 }, { "epoch": 0.1638669866300994, "grad_norm": 0.3617438979344363, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83187029.33333333, "logits/rejected": 104574201.08108108, "logps/chosen": -196.44444444444446, "logps/rejected": -282.3783783783784, "loss": 0.198, "rewards/chosen": 0.6238425925925926, "rewards/margins": 5.326545295295295, "rewards/rejected": -4.702702702702703, "step": 239 }, { "epoch": 0.164552622557422, "grad_norm": 0.35943819351966716, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161587338.84745762, "logits/rejected": 83460570.89855072, "logps/chosen": -247.59322033898306, "logps/rejected": -340.8695652173913, "loss": 0.2143, "rewards/chosen": 0.8336864406779662, "rewards/margins": 5.420642962417097, "rewards/rejected": -4.586956521739131, "step": 240 }, { "epoch": 0.1652382584847446, "grad_norm": 0.19331763915061576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140030886.17543858, "logits/rejected": 97473261.97183098, "logps/chosen": -256.280701754386, "logps/rejected": -292.056338028169, "loss": 0.1886, "rewards/chosen": 1.1754385964912282, "rewards/margins": 4.872621695082778, "rewards/rejected": -3.6971830985915495, "step": 241 }, { "epoch": 0.1659238944120672, "grad_norm": 0.2536074850341468, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173200082.82352942, "logits/rejected": 88010478.93333334, "logps/chosen": -267.05882352941177, "logps/rejected": -370.93333333333334, "loss": 0.2136, "rewards/chosen": 1.0404411764705883, "rewards/margins": 7331437.840441177, "rewards/rejected": -7331436.8, "step": 242 }, { "epoch": 0.16660953033938977, "grad_norm": 0.20566907669089488, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176013599.4385965, "logits/rejected": 94106003.83098592, "logps/chosen": -319.43859649122805, "logps/rejected": -361.9154929577465, "loss": 0.1885, "rewards/chosen": 1.0592105263157894, "rewards/margins": 6.228224610822831, "rewards/rejected": -5.169014084507042, "step": 243 }, { "epoch": 0.16729516626671237, "grad_norm": 0.2562710982291003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133494572.13793103, "logits/rejected": 96498951.31428571, "logps/chosen": -263.7241379310345, "logps/rejected": -349.25714285714287, "loss": 0.1962, "rewards/chosen": 1.2435344827586208, "rewards/margins": 6.486391625615764, "rewards/rejected": -5.242857142857143, "step": 244 }, { "epoch": 0.16798080219403497, "grad_norm": 0.26453555920550886, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 193583261.53846154, "logits/rejected": 32222906.92063492, "logps/chosen": -254.27692307692308, "logps/rejected": -292.8253968253968, "loss": 0.2473, "rewards/chosen": 0.5624399038461538, "rewards/margins": 5.014820856227106, "rewards/rejected": -4.4523809523809526, "step": 245 }, { "epoch": 0.16866643812135756, "grad_norm": 0.24805743526825003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131873852.23529412, "logits/rejected": 118768708.26666667, "logps/chosen": -260.70588235294116, "logps/rejected": -394.1333333333333, "loss": 0.2324, "rewards/chosen": 0.7371323529411765, "rewards/margins": 5.237132352941177, "rewards/rejected": -4.5, "step": 246 }, { "epoch": 0.16935207404868016, "grad_norm": 0.34119361395954656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131820982.85714285, "logits/rejected": 146155362.46153846, "logps/chosen": -263.1111111111111, "logps/rejected": -350.2769230769231, "loss": 0.2292, "rewards/chosen": 0.6984126984126984, "rewards/margins": 4.852258852258853, "rewards/rejected": -4.153846153846154, "step": 247 }, { "epoch": 0.17003770997600273, "grad_norm": 0.23980277327219165, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166594811.50877193, "logits/rejected": 117913109.63380282, "logps/chosen": -217.68421052631578, "logps/rejected": -335.32394366197184, "loss": 0.1886, "rewards/chosen": 0.5241228070175439, "rewards/margins": 5.981869285890783, "rewards/rejected": -5.457746478873239, "step": 248 }, { "epoch": 0.17072334590332533, "grad_norm": 0.2535583180868984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128868470.72463769, "logits/rejected": 64691807.45762712, "logps/chosen": -271.768115942029, "logps/rejected": -345.49152542372883, "loss": 0.2178, "rewards/chosen": 1.141304347826087, "rewards/margins": 6.115880619012527, "rewards/rejected": -4.97457627118644, "step": 249 }, { "epoch": 0.17140898183064793, "grad_norm": 0.21231813802341107, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 177724745.76271185, "logits/rejected": 67108864.0, "logps/chosen": -273.89830508474574, "logps/rejected": -308.8695652173913, "loss": 0.2202, "rewards/chosen": 0.9645127118644068, "rewards/margins": 5.533353291574551, "rewards/rejected": -4.568840579710145, "step": 250 }, { "epoch": 0.17209461775797052, "grad_norm": 0.24018651077415026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112764894.4262295, "logits/rejected": 100937177.79104477, "logps/chosen": -224.78688524590163, "logps/rejected": -362.02985074626866, "loss": 0.2203, "rewards/chosen": 0.8299180327868853, "rewards/margins": 3.51648519696599, "rewards/rejected": -2.6865671641791047, "step": 251 }, { "epoch": 0.17278025368529312, "grad_norm": 0.2592164476829281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134742016.0, "logits/rejected": 114819072.0, "logps/chosen": -259.7142857142857, "logps/rejected": -332.8888888888889, "loss": 0.22, "rewards/chosen": 0.6252790178571429, "rewards/margins": 4.882223462301588, "rewards/rejected": -4.256944444444445, "step": 252 }, { "epoch": 0.1734658896126157, "grad_norm": 0.5184454162725811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100902970.51428571, "logits/rejected": 131903629.2413793, "logps/chosen": -242.5142857142857, "logps/rejected": -406.8965517241379, "loss": 0.2418, "rewards/chosen": 0.43839285714285714, "rewards/margins": 5.834944581280788, "rewards/rejected": -5.396551724137931, "step": 253 }, { "epoch": 0.1741515255399383, "grad_norm": 0.2540954987166149, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119021970.8852459, "logits/rejected": 73275116.89552239, "logps/chosen": -198.95081967213116, "logps/rejected": -342.92537313432837, "loss": 0.1893, "rewards/chosen": 0.9559426229508197, "rewards/margins": 6.157435160264253, "rewards/rejected": -5.201492537313433, "step": 254 }, { "epoch": 0.1748371614672609, "grad_norm": 0.25491094938887954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128478154.10526316, "logits/rejected": 64096054.08450704, "logps/chosen": -227.08771929824562, "logps/rejected": -316.61971830985914, "loss": 0.2209, "rewards/chosen": 0.6466557017543859, "rewards/margins": 4.22412049048678, "rewards/rejected": -3.5774647887323945, "step": 255 }, { "epoch": 0.17552279739458349, "grad_norm": 0.2195313990782611, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127837659.94366197, "logits/rejected": 112363196.63157895, "logps/chosen": -292.7323943661972, "logps/rejected": -320.8421052631579, "loss": 0.2151, "rewards/chosen": 1.4577464788732395, "rewards/margins": 5.01914998764517, "rewards/rejected": -3.56140350877193, "step": 256 }, { "epoch": 0.17620843332190606, "grad_norm": 0.21055225244776513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94405128.12698413, "logits/rejected": 105051183.26153846, "logps/chosen": -209.015873015873, "logps/rejected": -352.4923076923077, "loss": 0.1902, "rewards/chosen": 0.9248511904761905, "rewards/margins": 6.447928113553114, "rewards/rejected": -5.523076923076923, "step": 257 }, { "epoch": 0.17689406924922865, "grad_norm": 0.32367826803911537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109524501.63380282, "logits/rejected": 122738580.21052632, "logps/chosen": -219.04225352112675, "logps/rejected": -396.63157894736844, "loss": 0.2447, "rewards/chosen": 0.6879401408450704, "rewards/margins": 5.845834877687175, "rewards/rejected": -5.157894736842105, "step": 258 }, { "epoch": 0.17757970517655125, "grad_norm": 0.20034262738728156, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 187478731.17460316, "logits/rejected": 85692857.1076923, "logps/chosen": -297.9047619047619, "logps/rejected": -285.53846153846155, "loss": 0.2249, "rewards/chosen": 0.5639880952380952, "rewards/margins": 5.594757326007326, "rewards/rejected": -5.030769230769231, "step": 259 }, { "epoch": 0.17826534110387385, "grad_norm": 0.23721388424987722, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99934208.0, "logits/rejected": 140509184.0, "logps/chosen": -233.0, "logps/rejected": -365.5, "loss": 0.1939, "rewards/chosen": 1.177734375, "rewards/margins": 5.005859375, "rewards/rejected": -3.828125, "step": 260 }, { "epoch": 0.17895097703119645, "grad_norm": 0.2385795545746065, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112999484.23529412, "logits/rejected": 62460177.06666667, "logps/chosen": -213.88235294117646, "logps/rejected": -338.93333333333334, "loss": 0.2492, "rewards/chosen": 0.5235523897058824, "rewards/margins": 5.831885723039216, "rewards/rejected": -5.308333333333334, "step": 261 }, { "epoch": 0.17963661295851902, "grad_norm": 0.6644641467658682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126540018.98305085, "logits/rejected": 98474963.47826087, "logps/chosen": -210.71186440677965, "logps/rejected": -306.5507246376812, "loss": 0.2234, "rewards/chosen": 0.7097457627118644, "rewards/margins": 5.5322095308278065, "rewards/rejected": -4.822463768115942, "step": 262 }, { "epoch": 0.1803222488858416, "grad_norm": 0.23302491149935817, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154140672.0, "logits/rejected": 92667904.0, "logps/chosen": -292.5, "logps/rejected": -320.0, "loss": 0.2356, "rewards/chosen": 0.68798828125, "rewards/margins": 4.63330078125, "rewards/rejected": -3.9453125, "step": 263 }, { "epoch": 0.1810078848131642, "grad_norm": 0.31227021080926115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149344726.03278688, "logits/rejected": 124577088.95522387, "logps/chosen": -279.73770491803276, "logps/rejected": -338.86567164179104, "loss": 0.251, "rewards/chosen": 0.44415983606557374, "rewards/margins": 5.373264313677514, "rewards/rejected": -4.92910447761194, "step": 264 }, { "epoch": 0.1816935207404868, "grad_norm": 0.21257714248384144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118457312.96969697, "logits/rejected": 131985275.87096775, "logps/chosen": -239.03030303030303, "logps/rejected": -391.2258064516129, "loss": 0.2228, "rewards/chosen": 0.9535984848484849, "rewards/margins": 6.820533968719452, "rewards/rejected": -5.866935483870968, "step": 265 }, { "epoch": 0.1823791566678094, "grad_norm": 0.17464540892188396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122546621.2173913, "logits/rejected": 131160862.37288135, "logps/chosen": -231.42028985507247, "logps/rejected": -375.0508474576271, "loss": 0.2321, "rewards/chosen": 0.8093297101449275, "rewards/margins": 4.415261913534758, "rewards/rejected": -3.6059322033898304, "step": 266 }, { "epoch": 0.18306479259513198, "grad_norm": 0.3201645749069392, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173412775.72413793, "logits/rejected": 77235112.22857143, "logps/chosen": -278.0689655172414, "logps/rejected": -294.62857142857143, "loss": 0.2195, "rewards/chosen": 0.6179956896551724, "rewards/margins": 4.8608528325123155, "rewards/rejected": -4.242857142857143, "step": 267 }, { "epoch": 0.18375042852245457, "grad_norm": 0.22585431266868222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152859079.1111111, "logits/rejected": 116730706.70769231, "logps/chosen": -233.65079365079364, "logps/rejected": -385.7230769230769, "loss": 0.1915, "rewards/chosen": 1.2242063492063493, "rewards/margins": 6.601129426129427, "rewards/rejected": -5.376923076923077, "step": 268 }, { "epoch": 0.18443606444977717, "grad_norm": 0.3425616363342169, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98957138.44067797, "logits/rejected": 140053281.39130434, "logps/chosen": -256.0, "logps/rejected": -304.69565217391306, "loss": 0.2156, "rewards/chosen": 0.8760593220338984, "rewards/margins": 5.434030336526652, "rewards/rejected": -4.557971014492754, "step": 269 }, { "epoch": 0.18512170037709977, "grad_norm": 0.33416685388017753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140035633.5483871, "logits/rejected": 136505530.1818182, "logps/chosen": -226.83870967741936, "logps/rejected": -356.3636363636364, "loss": 0.2247, "rewards/chosen": 0.7338709677419355, "rewards/margins": 5.65811339198436, "rewards/rejected": -4.924242424242424, "step": 270 }, { "epoch": 0.18580733630442234, "grad_norm": 0.2961689537632258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93631668.70588236, "logits/rejected": 111463628.8, "logps/chosen": -202.8235294117647, "logps/rejected": -325.8666666666667, "loss": 0.231, "rewards/chosen": 1.1378676470588236, "rewards/margins": 6.187867647058823, "rewards/rejected": -5.05, "step": 271 }, { "epoch": 0.18649297223174494, "grad_norm": 0.25817670898955425, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122833188.57142857, "logits/rejected": 95330021.51724137, "logps/chosen": -263.42857142857144, "logps/rejected": -301.9310344827586, "loss": 0.2189, "rewards/chosen": 1.0459821428571427, "rewards/margins": 6.2959821428571425, "rewards/rejected": -5.25, "step": 272 }, { "epoch": 0.18717860815906753, "grad_norm": 0.6385580901019109, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124164713.65079366, "logits/rejected": 139121837.2923077, "logps/chosen": -325.58730158730157, "logps/rejected": -381.04615384615386, "loss": 0.2021, "rewards/chosen": 1.1845238095238095, "rewards/margins": 6.342216117216117, "rewards/rejected": -5.157692307692308, "step": 273 }, { "epoch": 0.18786424408639013, "grad_norm": 0.30430352952208356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135311894.26086956, "logits/rejected": 97641975.3220339, "logps/chosen": -304.92753623188406, "logps/rejected": -307.79661016949154, "loss": 0.2508, "rewards/chosen": 0.8306159420289855, "rewards/margins": 5.453497297961189, "rewards/rejected": -4.622881355932203, "step": 274 }, { "epoch": 0.18854988001371273, "grad_norm": 0.2286369815878566, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87930587.42857143, "logits/rejected": 138761557.33333334, "logps/chosen": -215.42857142857142, "logps/rejected": -312.0, "loss": 0.2156, "rewards/chosen": 0.5926339285714286, "rewards/margins": 5.2558283730158735, "rewards/rejected": -4.663194444444445, "step": 275 }, { "epoch": 0.1892355159410353, "grad_norm": 0.24350823329818128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139781006.2222222, "logits/rejected": 82058156.97297297, "logps/chosen": -268.22222222222223, "logps/rejected": -341.18918918918916, "loss": 0.2053, "rewards/chosen": 0.9560185185185185, "rewards/margins": 5.692505005005005, "rewards/rejected": -4.736486486486487, "step": 276 }, { "epoch": 0.1899211518683579, "grad_norm": 0.5966300371998317, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139949943.46666667, "logits/rejected": 87903051.29411764, "logps/chosen": -210.93333333333334, "logps/rejected": -340.94117647058823, "loss": 0.2094, "rewards/chosen": 0.9421875, "rewards/margins": 5.934834558823529, "rewards/rejected": -4.992647058823529, "step": 277 }, { "epoch": 0.1906067877956805, "grad_norm": 0.19971198525271794, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109496754.42424242, "logits/rejected": 133029623.74193548, "logps/chosen": -226.9090909090909, "logps/rejected": -317.4193548387097, "loss": 0.2132, "rewards/chosen": 0.8494318181818182, "rewards/margins": 5.655883431085044, "rewards/rejected": -4.806451612903226, "step": 278 }, { "epoch": 0.1912924237230031, "grad_norm": 0.27764293542842894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99255996.63157895, "logits/rejected": 114052804.92307693, "logps/chosen": -224.6315789473684, "logps/rejected": -353.2307692307692, "loss": 0.2355, "rewards/chosen": 1.0476973684210527, "rewards/margins": 6.326543522267206, "rewards/rejected": -5.278846153846154, "step": 279 }, { "epoch": 0.1919780596503257, "grad_norm": 0.30423243704104413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114359311.75384615, "logits/rejected": 128492170.15873016, "logps/chosen": -213.66153846153847, "logps/rejected": -341.07936507936506, "loss": 0.2238, "rewards/chosen": 0.5231971153846153, "rewards/margins": 5.92795902014652, "rewards/rejected": -5.404761904761905, "step": 280 }, { "epoch": 0.19266369557764826, "grad_norm": 0.22466607775765776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 20717970.8852459, "logits/rejected": 168773784.83582088, "logps/chosen": -192.0655737704918, "logps/rejected": -434.6268656716418, "loss": 0.1823, "rewards/chosen": 0.8237704918032787, "rewards/margins": 5.876009297773428, "rewards/rejected": -5.052238805970149, "step": 281 }, { "epoch": 0.19334933150497086, "grad_norm": 0.2671087169852036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137105609.44262296, "logits/rejected": 132464884.53731343, "logps/chosen": -245.24590163934425, "logps/rejected": -381.3731343283582, "loss": 0.2025, "rewards/chosen": 0.8985655737704918, "rewards/margins": 3.398565573770492, "rewards/rejected": -2.5, "step": 282 }, { "epoch": 0.19403496743229345, "grad_norm": 0.3703994475046999, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98124638.31578948, "logits/rejected": 165172873.01408452, "logps/chosen": -264.9824561403509, "logps/rejected": -399.77464788732397, "loss": 0.1991, "rewards/chosen": 0.918859649122807, "rewards/margins": 6.186465282925624, "rewards/rejected": -5.267605633802817, "step": 283 }, { "epoch": 0.19472060335961605, "grad_norm": 0.3694914674872744, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129255452.84507042, "logits/rejected": 131863031.01754385, "logps/chosen": -219.71830985915494, "logps/rejected": -356.49122807017545, "loss": 0.2195, "rewards/chosen": 0.7772887323943662, "rewards/margins": 6.24220101309612, "rewards/rejected": -5.464912280701754, "step": 284 }, { "epoch": 0.19540623928693865, "grad_norm": 0.25081733862003097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157759950.4516129, "logits/rejected": 72383519.03030303, "logps/chosen": -291.0967741935484, "logps/rejected": -317.8181818181818, "loss": 0.2013, "rewards/chosen": 1.1940524193548387, "rewards/margins": 6.292537267839688, "rewards/rejected": -5.098484848484849, "step": 285 }, { "epoch": 0.19609187521426122, "grad_norm": 0.33641289507759964, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85334113.52380952, "logits/rejected": 142864447.0153846, "logps/chosen": -227.8095238095238, "logps/rejected": -309.16923076923075, "loss": 0.2346, "rewards/chosen": 0.8174603174603174, "rewards/margins": 5.113614163614163, "rewards/rejected": -4.296153846153846, "step": 286 }, { "epoch": 0.19677751114158382, "grad_norm": 0.3000545486910348, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128929257.73913044, "logits/rejected": 83016529.17073171, "logps/chosen": -241.91304347826087, "logps/rejected": -364.0975609756098, "loss": 0.1687, "rewards/chosen": 1.0067934782608696, "rewards/margins": 5.9519154294803815, "rewards/rejected": -4.945121951219512, "step": 287 }, { "epoch": 0.19746314706890641, "grad_norm": 0.2060848053458396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107213304.98630136, "logits/rejected": 136200489.8909091, "logps/chosen": -207.78082191780823, "logps/rejected": -381.6727272727273, "loss": 0.22, "rewards/chosen": 0.9957191780821918, "rewards/margins": 4.41390099626401, "rewards/rejected": -3.418181818181818, "step": 288 }, { "epoch": 0.198148782996229, "grad_norm": 0.23077964496314876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116696361.29032259, "logits/rejected": 129896323.87878788, "logps/chosen": -225.29032258064515, "logps/rejected": -370.42424242424244, "loss": 0.2226, "rewards/chosen": 0.6470199092741935, "rewards/margins": 5.3780805153347995, "rewards/rejected": -4.731060606060606, "step": 289 }, { "epoch": 0.19883441892355158, "grad_norm": 0.32475025716180284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127679548.23529412, "logits/rejected": 109191714.13333334, "logps/chosen": -170.58823529411765, "logps/rejected": -372.8, "loss": 0.2112, "rewards/chosen": 0.7775735294117647, "rewards/margins": 5.519240196078431, "rewards/rejected": -4.741666666666666, "step": 290 }, { "epoch": 0.19952005485087418, "grad_norm": 0.2665339963023512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146731880.9180328, "logits/rejected": 107205158.20895523, "logps/chosen": -313.7049180327869, "logps/rejected": -351.5223880597015, "loss": 0.1826, "rewards/chosen": 1.596311475409836, "rewards/margins": 6.853774161977, "rewards/rejected": -5.257462686567164, "step": 291 }, { "epoch": 0.20020569077819678, "grad_norm": 0.32057660072142724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169525003.46268657, "logits/rejected": 92137169.83606558, "logps/chosen": -253.37313432835822, "logps/rejected": -347.8032786885246, "loss": 0.2229, "rewards/chosen": 0.753964552238806, "rewards/margins": 6.53265307682897, "rewards/rejected": -5.778688524590164, "step": 292 }, { "epoch": 0.20089132670551937, "grad_norm": 0.19229475675923904, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134535478.3030303, "logits/rejected": 85205256.25806452, "logps/chosen": -223.27272727272728, "logps/rejected": -329.80645161290323, "loss": 0.2085, "rewards/chosen": 1.0085227272727273, "rewards/margins": 6.3230388563049855, "rewards/rejected": -5.314516129032258, "step": 293 }, { "epoch": 0.20157696263284197, "grad_norm": 0.21298365596075192, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151120147.1044776, "logits/rejected": 113039930.75409836, "logps/chosen": -236.4179104477612, "logps/rejected": -356.1967213114754, "loss": 0.2361, "rewards/chosen": 0.7737873134328358, "rewards/margins": 5.822967641301688, "rewards/rejected": -5.049180327868853, "step": 294 }, { "epoch": 0.20226259856016454, "grad_norm": 0.19603396777740625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133774099.6923077, "logits/rejected": 113025455.15789473, "logps/chosen": -266.3076923076923, "logps/rejected": -388.2105263157895, "loss": 0.1866, "rewards/chosen": 0.74609375, "rewards/margins": 6.193462171052632, "rewards/rejected": -5.447368421052632, "step": 295 }, { "epoch": 0.20294823448748714, "grad_norm": 0.4281401093273322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142606336.0, "logits/rejected": 111635894.85714285, "logps/chosen": -256.44444444444446, "logps/rejected": -361.7142857142857, "loss": 0.2704, "rewards/chosen": 0.6861979166666666, "rewards/margins": 5.5031622023809526, "rewards/rejected": -4.816964285714286, "step": 296 }, { "epoch": 0.20363387041480974, "grad_norm": 0.24143738071385873, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165360435.2, "logits/rejected": 57255578.41269841, "logps/chosen": -265.7230769230769, "logps/rejected": -316.95238095238096, "loss": 0.1998, "rewards/chosen": 1.3197115384615385, "rewards/margins": 6.363362332112332, "rewards/rejected": -5.0436507936507935, "step": 297 }, { "epoch": 0.20431950634213233, "grad_norm": 0.24341487017892482, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103355162.74626866, "logits/rejected": 92343447.08196722, "logps/chosen": -213.97014925373134, "logps/rejected": -298.75409836065575, "loss": 0.2242, "rewards/chosen": 0.7975746268656716, "rewards/margins": 6.125443479324688, "rewards/rejected": -5.327868852459017, "step": 298 }, { "epoch": 0.20500514226945493, "grad_norm": 0.39964000302924685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146543845.87755102, "logits/rejected": 60127206.07594936, "logps/chosen": -244.24489795918367, "logps/rejected": -339.44303797468353, "loss": 0.1871, "rewards/chosen": 0.3364158163265306, "rewards/margins": 5.621225942908809, "rewards/rejected": -5.284810126582278, "step": 299 }, { "epoch": 0.2056907781967775, "grad_norm": 0.16810646739177493, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112840307.61290322, "logits/rejected": 88588784.48484848, "logps/chosen": -257.03225806451616, "logps/rejected": -344.24242424242425, "loss": 0.2029, "rewards/chosen": 0.9939516129032258, "rewards/margins": 6.660618279569893, "rewards/rejected": -5.666666666666667, "step": 300 }, { "epoch": 0.2063764141241001, "grad_norm": 0.30142551874751583, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 198397236.82539684, "logits/rejected": 94726742.64615385, "logps/chosen": -299.93650793650795, "logps/rejected": -301.04615384615386, "loss": 0.2129, "rewards/chosen": 1.0198412698412698, "rewards/margins": 6.504456654456654, "rewards/rejected": -5.484615384615385, "step": 301 }, { "epoch": 0.2070620500514227, "grad_norm": 0.18819260424131762, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122991796.70588236, "logits/rejected": 124710638.93333334, "logps/chosen": -252.23529411764707, "logps/rejected": -342.6666666666667, "loss": 0.2256, "rewards/chosen": 0.8795955882352942, "rewards/margins": 4.758762254901961, "rewards/rejected": -3.879166666666667, "step": 302 }, { "epoch": 0.2077476859787453, "grad_norm": 0.17575313405605422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141866164.70588234, "logits/rejected": 159872887.46666667, "logps/chosen": -241.41176470588235, "logps/rejected": -433.6, "loss": 0.1871, "rewards/chosen": 1.0229779411764706, "rewards/margins": 3.9896446078431373, "rewards/rejected": -2.966666666666667, "step": 303 }, { "epoch": 0.20843332190606786, "grad_norm": 0.2470388666678164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73585362.8235294, "logits/rejected": 145961779.2, "logps/chosen": -180.94117647058823, "logps/rejected": -358.4, "loss": 0.2213, "rewards/chosen": 0.7394301470588235, "rewards/margins": 6.48109681372549, "rewards/rejected": -5.741666666666666, "step": 304 }, { "epoch": 0.20911895783339046, "grad_norm": 0.2054823258287357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112005639.2112676, "logits/rejected": 127668727.01754385, "logps/chosen": -233.2394366197183, "logps/rejected": -324.49122807017545, "loss": 0.2442, "rewards/chosen": 0.8571742957746479, "rewards/margins": 2223623.594016401, "rewards/rejected": -2223622.736842105, "step": 305 }, { "epoch": 0.20980459376071306, "grad_norm": 0.23895117622666331, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93192192.0, "logits/rejected": 118095872.0, "logps/chosen": -186.875, "logps/rejected": -350.5, "loss": 0.2026, "rewards/chosen": 0.8681640625, "rewards/margins": 6.7822265625, "rewards/rejected": -5.9140625, "step": 306 }, { "epoch": 0.21049022968803566, "grad_norm": 0.23800416945774508, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96235975.1111111, "logits/rejected": 116817034.37837838, "logps/chosen": -237.92592592592592, "logps/rejected": -405.6216216216216, "loss": 0.1826, "rewards/chosen": 1.4450231481481481, "rewards/margins": 6.978806931931932, "rewards/rejected": -5.533783783783784, "step": 307 }, { "epoch": 0.21117586561535825, "grad_norm": 0.2513394365004747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155012023.88732395, "logits/rejected": 64754167.01754386, "logps/chosen": -243.1549295774648, "logps/rejected": -285.7543859649123, "loss": 0.1996, "rewards/chosen": 1.4058098591549295, "rewards/margins": 5.5593186310847535, "rewards/rejected": -4.1535087719298245, "step": 308 }, { "epoch": 0.21186150154268082, "grad_norm": 0.2548109462202273, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138139674.5974026, "logits/rejected": 92891497.41176471, "logps/chosen": -263.68831168831167, "logps/rejected": -308.078431372549, "loss": 0.2408, "rewards/chosen": 1.161525974025974, "rewards/margins": 5.578192640692641, "rewards/rejected": -4.416666666666667, "step": 309 }, { "epoch": 0.21254713747000342, "grad_norm": 0.23027257731135656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124952698.26865672, "logits/rejected": 86705202.36065574, "logps/chosen": -275.34328358208955, "logps/rejected": -317.9016393442623, "loss": 0.2115, "rewards/chosen": 0.9794776119402985, "rewards/margins": 5492283.208985809, "rewards/rejected": -5492282.229508197, "step": 310 }, { "epoch": 0.21323277339732602, "grad_norm": 0.22026804384291515, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115271044.4137931, "logits/rejected": 138771543.77142859, "logps/chosen": -202.3448275862069, "logps/rejected": -385.37142857142857, "loss": 0.2067, "rewards/chosen": 0.625, "rewards/margins": 4.875, "rewards/rejected": -4.25, "step": 311 }, { "epoch": 0.21391840932464862, "grad_norm": 0.206344831420334, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119704104.63492064, "logits/rejected": 79111026.21538462, "logps/chosen": -238.47619047619048, "logps/rejected": -338.2153846153846, "loss": 0.2043, "rewards/chosen": 1.0714285714285714, "rewards/margins": 6.036813186813187, "rewards/rejected": -4.9653846153846155, "step": 312 }, { "epoch": 0.21460404525197121, "grad_norm": 0.21784573868994345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 186946121.14285713, "logits/rejected": 82611861.06329113, "logps/chosen": -288.3265306122449, "logps/rejected": -346.73417721518985, "loss": 0.1713, "rewards/chosen": 0.9853316326530612, "rewards/margins": 6.47267340480496, "rewards/rejected": -5.487341772151899, "step": 313 }, { "epoch": 0.21528968117929378, "grad_norm": 0.19589518743177703, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166889148.63157895, "logits/rejected": 103557956.50704226, "logps/chosen": -255.1578947368421, "logps/rejected": -374.3098591549296, "loss": 0.2009, "rewards/chosen": 0.4791666666666667, "rewards/margins": 5.091842723004695, "rewards/rejected": -4.612676056338028, "step": 314 }, { "epoch": 0.21597531710661638, "grad_norm": 0.18671301980271332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75085531.42857143, "logits/rejected": 150412401.7777778, "logps/chosen": -253.14285714285714, "logps/rejected": -345.3333333333333, "loss": 0.1681, "rewards/chosen": 1.3582589285714286, "rewards/margins": 5.6985367063492065, "rewards/rejected": -4.340277777777778, "step": 315 }, { "epoch": 0.21666095303393898, "grad_norm": 0.2083842108224796, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128170939.73333333, "logits/rejected": 69067233.88235295, "logps/chosen": -280.8, "logps/rejected": -311.05882352941177, "loss": 0.203, "rewards/chosen": 1.1994791666666667, "rewards/margins": 5.73624387254902, "rewards/rejected": -4.536764705882353, "step": 316 }, { "epoch": 0.21734658896126158, "grad_norm": 0.18125851086389871, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134742016.0, "logits/rejected": 136376560.94117647, "logps/chosen": -235.2, "logps/rejected": -368.0, "loss": 0.2153, "rewards/chosen": 0.6989583333333333, "rewards/margins": 6.632781862745098, "rewards/rejected": -5.9338235294117645, "step": 317 }, { "epoch": 0.21803222488858418, "grad_norm": 0.2914619246517285, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134454026.81690142, "logits/rejected": 123180085.89473684, "logps/chosen": -251.94366197183098, "logps/rejected": -396.35087719298247, "loss": 0.229, "rewards/chosen": 1.0107834507042253, "rewards/margins": 6.423064152458611, "rewards/rejected": -5.412280701754386, "step": 318 }, { "epoch": 0.21871786081590674, "grad_norm": 0.262742878577024, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117562086.0289855, "logits/rejected": 109265173.69491525, "logps/chosen": -227.0144927536232, "logps/rejected": -296.40677966101697, "loss": 0.2184, "rewards/chosen": 1.0579710144927537, "rewards/margins": 5.227462539916483, "rewards/rejected": -4.169491525423729, "step": 319 }, { "epoch": 0.21940349674322934, "grad_norm": 0.25474956171887553, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183024174.54545453, "logits/rejected": 96266041.80645162, "logps/chosen": -253.33333333333334, "logps/rejected": -357.6774193548387, "loss": 0.2213, "rewards/chosen": 0.9422348484848485, "rewards/margins": 6.08739613880743, "rewards/rejected": -5.145161290322581, "step": 320 }, { "epoch": 0.22008913267055194, "grad_norm": 0.22990219156640165, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142719695.56756756, "logits/rejected": 103148809.48148148, "logps/chosen": -260.1081081081081, "logps/rejected": -379.85185185185185, "loss": 0.2079, "rewards/chosen": 1.4054054054054055, "rewards/margins": 6.562812812812813, "rewards/rejected": -5.157407407407407, "step": 321 }, { "epoch": 0.22077476859787454, "grad_norm": 0.1899177129373598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103492786.79365079, "logits/rejected": 112116972.3076923, "logps/chosen": -265.14285714285717, "logps/rejected": -351.0153846153846, "loss": 0.1972, "rewards/chosen": 0.9811507936507936, "rewards/margins": 6.681150793650794, "rewards/rejected": -5.7, "step": 322 }, { "epoch": 0.2214604045251971, "grad_norm": 0.19016823936090518, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155971236.88135594, "logits/rejected": 70414157.91304348, "logps/chosen": -307.79661016949154, "logps/rejected": -325.7971014492754, "loss": 0.21, "rewards/chosen": 1.2658898305084745, "rewards/margins": 6.49777388847949, "rewards/rejected": -5.231884057971015, "step": 323 }, { "epoch": 0.2221460404525197, "grad_norm": 0.19439760588312124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117307359.49206349, "logits/rejected": 107438710.15384616, "logps/chosen": -240.25396825396825, "logps/rejected": -352.4923076923077, "loss": 0.1982, "rewards/chosen": 1.306547619047619, "rewards/margins": 6.283470695970696, "rewards/rejected": -4.976923076923077, "step": 324 }, { "epoch": 0.2228316763798423, "grad_norm": 0.2616763417200805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111377836.21818182, "logits/rejected": 111924714.9589041, "logps/chosen": -251.92727272727274, "logps/rejected": -342.7945205479452, "loss": 0.1551, "rewards/chosen": 1.4619318181818182, "rewards/margins": 7.277000311332503, "rewards/rejected": -5.815068493150685, "step": 325 }, { "epoch": 0.2235173123071649, "grad_norm": 0.3256694071395796, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67598199.46666667, "logits/rejected": 152968734.11764705, "logps/chosen": -197.33333333333334, "logps/rejected": -373.1764705882353, "loss": 0.1767, "rewards/chosen": 1.0583333333333333, "rewards/margins": 7.308333333333334, "rewards/rejected": -6.25, "step": 326 }, { "epoch": 0.2242029482344875, "grad_norm": 0.2114134169014718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105551273.35384615, "logits/rejected": 117440512.0, "logps/chosen": -201.96923076923076, "logps/rejected": -366.22222222222223, "loss": 0.1699, "rewards/chosen": 1.398076923076923, "rewards/margins": 6.969505494505494, "rewards/rejected": -5.571428571428571, "step": 327 }, { "epoch": 0.22488858416181007, "grad_norm": 0.3629559152107846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56417500.8627451, "logits/rejected": 126714281.55844156, "logps/chosen": -189.01960784313727, "logps/rejected": -309.61038961038963, "loss": 0.1856, "rewards/chosen": 0.7303921568627451, "rewards/margins": 5.620002546473135, "rewards/rejected": -4.8896103896103895, "step": 328 }, { "epoch": 0.22557422008913267, "grad_norm": 0.23618606342593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159029103.7746479, "logits/rejected": 48250592.561403506, "logps/chosen": -253.9718309859155, "logps/rejected": -294.7368421052632, "loss": 0.2249, "rewards/chosen": 0.8069982394366197, "rewards/margins": 6.885945607857672, "rewards/rejected": -6.078947368421052, "step": 329 }, { "epoch": 0.22625985601645526, "grad_norm": 0.17546830373585476, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122852517.16129032, "logits/rejected": 80772127.03030303, "logps/chosen": -238.96774193548387, "logps/rejected": -330.6666666666667, "loss": 0.1924, "rewards/chosen": 0.876008064516129, "rewards/margins": 6.141159579667645, "rewards/rejected": -5.265151515151516, "step": 330 }, { "epoch": 0.22694549194377786, "grad_norm": 0.22020689111464642, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103403123.61290322, "logits/rejected": 137903631.5151515, "logps/chosen": -227.61290322580646, "logps/rejected": -354.42424242424244, "loss": 0.2025, "rewards/chosen": 1.2268145161290323, "rewards/margins": 6.79499633431085, "rewards/rejected": -5.568181818181818, "step": 331 }, { "epoch": 0.22763112787110046, "grad_norm": 0.24883124686675095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167514048.9846154, "logits/rejected": 113179631.74603175, "logps/chosen": -277.16923076923075, "logps/rejected": -384.5079365079365, "loss": 0.2101, "rewards/chosen": 1.0336538461538463, "rewards/margins": 6.55746336996337, "rewards/rejected": -5.523809523809524, "step": 332 }, { "epoch": 0.22831676379842303, "grad_norm": 0.20220062510378095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164611235.2463768, "logits/rejected": 159952271.18644068, "logps/chosen": -340.40579710144925, "logps/rejected": -437.6949152542373, "loss": 0.1903, "rewards/chosen": 1.2318840579710144, "rewards/margins": 5.7488332105133875, "rewards/rejected": -4.516949152542373, "step": 333 }, { "epoch": 0.22900239972574563, "grad_norm": 0.28907397607113033, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120925919.54929577, "logits/rejected": 123842344.42105263, "logps/chosen": -237.07042253521126, "logps/rejected": -421.6140350877193, "loss": 0.237, "rewards/chosen": 0.6346830985915493, "rewards/margins": 6.625911168766987, "rewards/rejected": -5.991228070175438, "step": 334 }, { "epoch": 0.22968803565306822, "grad_norm": 0.21618473713548791, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99442565.73134328, "logits/rejected": 151269980.32786885, "logps/chosen": -219.70149253731344, "logps/rejected": -348.327868852459, "loss": 0.1946, "rewards/chosen": 0.9085820895522388, "rewards/margins": 6.5725165157817464, "rewards/rejected": -5.663934426229508, "step": 335 }, { "epoch": 0.23037367158039082, "grad_norm": 0.19455195905511682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161513992.12698412, "logits/rejected": 71432223.5076923, "logps/chosen": -210.28571428571428, "logps/rejected": -333.53846153846155, "loss": 0.1998, "rewards/chosen": 0.9632936507936508, "rewards/margins": 6.655601343101344, "rewards/rejected": -5.6923076923076925, "step": 336 }, { "epoch": 0.2310593075077134, "grad_norm": 0.2830747963271367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133783834.48275863, "logits/rejected": 64652200.22857143, "logps/chosen": -225.3793103448276, "logps/rejected": -304.9142857142857, "loss": 0.1836, "rewards/chosen": 0.8663793103448276, "rewards/margins": 6.394950738916256, "rewards/rejected": -5.5285714285714285, "step": 337 }, { "epoch": 0.231744943435036, "grad_norm": 0.33265667516661945, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171853104.43243244, "logits/rejected": 38447786.666666664, "logps/chosen": -256.43243243243245, "logps/rejected": -276.44444444444446, "loss": 0.234, "rewards/chosen": 1.1097972972972974, "rewards/margins": 5.906093593593594, "rewards/rejected": -4.796296296296297, "step": 338 }, { "epoch": 0.23243057936235859, "grad_norm": 0.19667554091524192, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155320320.0, "logits/rejected": 64815104.0, "logps/chosen": -268.25, "logps/rejected": -326.5, "loss": 0.1975, "rewards/chosen": 1.0537109375, "rewards/margins": 6.1240234375, "rewards/rejected": -5.0703125, "step": 339 }, { "epoch": 0.23311621528968118, "grad_norm": 0.2951251271396855, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145058390.64615384, "logits/rejected": 64645542.603174604, "logps/chosen": -254.76923076923077, "logps/rejected": -327.6190476190476, "loss": 0.2163, "rewards/chosen": 1.0427884615384615, "rewards/margins": 6.447550366300367, "rewards/rejected": -5.404761904761905, "step": 340 }, { "epoch": 0.23380185121700378, "grad_norm": 0.3048137140328591, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153878528.0, "logits/rejected": 87543808.0, "logps/chosen": -285.5, "logps/rejected": -436.75, "loss": 0.2283, "rewards/chosen": 0.9638671875, "rewards/margins": 6.3935546875, "rewards/rejected": -5.4296875, "step": 341 }, { "epoch": 0.23448748714432635, "grad_norm": 0.34314509021769146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161247687.1111111, "logits/rejected": 98017658.09230769, "logps/chosen": -248.88888888888889, "logps/rejected": -372.67692307692306, "loss": 0.2307, "rewards/chosen": 1.1227678571428572, "rewards/margins": 6.161229395604396, "rewards/rejected": -5.038461538461538, "step": 342 }, { "epoch": 0.23517312307164895, "grad_norm": 0.267114054087205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118537483.81538461, "logits/rejected": 93073603.04761904, "logps/chosen": -261.16923076923075, "logps/rejected": -305.015873015873, "loss": 0.1906, "rewards/chosen": 1.1942307692307692, "rewards/margins": 6.61486568986569, "rewards/rejected": -5.420634920634921, "step": 343 }, { "epoch": 0.23585875899897155, "grad_norm": 0.23228811532033272, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115978860.60606061, "logits/rejected": 71958528.0, "logps/chosen": -255.27272727272728, "logps/rejected": -345.03225806451616, "loss": 0.2012, "rewards/chosen": 1.2608901515151516, "rewards/margins": 4.974599828934506, "rewards/rejected": -3.713709677419355, "step": 344 }, { "epoch": 0.23654439492629414, "grad_norm": 0.19610085420709733, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95585980.63157895, "logits/rejected": 139652600.78873238, "logps/chosen": -191.01754385964912, "logps/rejected": -338.4788732394366, "loss": 0.2305, "rewards/chosen": 0.5709978070175439, "rewards/margins": 5.356209074623178, "rewards/rejected": -4.785211267605634, "step": 345 }, { "epoch": 0.23723003085361674, "grad_norm": 0.2763366981310897, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97754343.22580644, "logits/rejected": 109306104.24242425, "logps/chosen": -235.09677419354838, "logps/rejected": -339.8787878787879, "loss": 0.2135, "rewards/chosen": 0.8709677419354839, "rewards/margins": 6.158846529814272, "rewards/rejected": -5.287878787878788, "step": 346 }, { "epoch": 0.2379156667809393, "grad_norm": 0.21514763993450278, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128698906.94736843, "logits/rejected": 108933754.59154929, "logps/chosen": -234.94736842105263, "logps/rejected": -306.4788732394366, "loss": 0.2295, "rewards/chosen": 0.48355263157894735, "rewards/margins": 5.096228687916975, "rewards/rejected": -4.612676056338028, "step": 347 }, { "epoch": 0.2386013027082619, "grad_norm": 0.19714121247777974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157286400.0, "logits/rejected": 98158364.44444445, "logps/chosen": -275.42857142857144, "logps/rejected": -317.77777777777777, "loss": 0.1771, "rewards/chosen": 1.4520089285714286, "rewards/margins": 6.820064484126984, "rewards/rejected": -5.368055555555555, "step": 348 }, { "epoch": 0.2392869386355845, "grad_norm": 0.27197720246026114, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113099039.43859649, "logits/rejected": 127483211.71830986, "logps/chosen": -254.31578947368422, "logps/rejected": -387.6056338028169, "loss": 0.1755, "rewards/chosen": 0.831140350877193, "rewards/margins": 4.795929083271559, "rewards/rejected": -3.964788732394366, "step": 349 }, { "epoch": 0.2399725745629071, "grad_norm": 0.2853301375169435, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125220269.41935484, "logits/rejected": 99328744.72727273, "logps/chosen": -213.16129032258064, "logps/rejected": -360.24242424242425, "loss": 0.1973, "rewards/chosen": 1.1663306451612903, "rewards/margins": 4.88602761485826, "rewards/rejected": -3.7196969696969697, "step": 350 }, { "epoch": 0.2406582104902297, "grad_norm": 0.24923828918715338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134853228.6060606, "logits/rejected": 122243666.58064516, "logps/chosen": -270.54545454545456, "logps/rejected": -413.4193548387097, "loss": 0.2166, "rewards/chosen": 1.0568181818181819, "rewards/margins": 5.782624633431085, "rewards/rejected": -4.725806451612903, "step": 351 }, { "epoch": 0.24134384641755227, "grad_norm": 0.24957864101361268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144197278.89655173, "logits/rejected": 96409073.37142856, "logps/chosen": -212.68965517241378, "logps/rejected": -364.8, "loss": 0.2243, "rewards/chosen": 0.6524784482758621, "rewards/margins": 5.752478448275862, "rewards/rejected": -5.1, "step": 352 }, { "epoch": 0.24202948234487487, "grad_norm": 0.20552316541389393, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134927058.82352942, "logits/rejected": 164836147.2, "logps/chosen": -219.2941176470588, "logps/rejected": -401.06666666666666, "loss": 0.2128, "rewards/chosen": 0.8115808823529411, "rewards/margins": 6.469914215686274, "rewards/rejected": -5.658333333333333, "step": 353 }, { "epoch": 0.24271511827219747, "grad_norm": 0.22310045430334763, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135107428.84848484, "logits/rejected": 132729426.58064516, "logps/chosen": -263.27272727272725, "logps/rejected": -394.3225806451613, "loss": 0.2301, "rewards/chosen": 0.7256155303030303, "rewards/margins": 6.854647788367546, "rewards/rejected": -6.129032258064516, "step": 354 }, { "epoch": 0.24340075419952006, "grad_norm": 0.23602907222298955, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135639525.9661017, "logits/rejected": 93429641.27536231, "logps/chosen": -202.3050847457627, "logps/rejected": -344.57971014492756, "loss": 0.1924, "rewards/chosen": 1.2584745762711864, "rewards/margins": 6.388909358879882, "rewards/rejected": -5.130434782608695, "step": 355 }, { "epoch": 0.24408639012684263, "grad_norm": 0.17729338215101445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99098257.19402985, "logits/rejected": 111561610.49180327, "logps/chosen": -194.62686567164178, "logps/rejected": -303.4754098360656, "loss": 0.2198, "rewards/chosen": 0.5942164179104478, "rewards/margins": 6.020445926107169, "rewards/rejected": -5.426229508196721, "step": 356 }, { "epoch": 0.24477202605416523, "grad_norm": 0.18258929889221737, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154927104.0, "logits/rejected": 127401984.0, "logps/chosen": -267.0, "logps/rejected": -323.25, "loss": 0.2362, "rewards/chosen": 0.70556640625, "rewards/margins": 5.98681640625, "rewards/rejected": -5.28125, "step": 357 }, { "epoch": 0.24545766198148783, "grad_norm": 0.15520992180239096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121266894.59649123, "logits/rejected": 84299602.92957747, "logps/chosen": -242.24561403508773, "logps/rejected": -290.92957746478874, "loss": 0.2035, "rewards/chosen": 0.756578947368421, "rewards/margins": 5.883339510748703, "rewards/rejected": -5.126760563380282, "step": 358 }, { "epoch": 0.24614329790881043, "grad_norm": 0.2786434603838572, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167772160.0, "logits/rejected": 93053630.17142858, "logps/chosen": -221.3793103448276, "logps/rejected": -410.51428571428573, "loss": 0.2198, "rewards/chosen": 0.6767241379310345, "rewards/margins": 5.676724137931035, "rewards/rejected": -5.0, "step": 359 }, { "epoch": 0.24682893383613302, "grad_norm": 0.2124587215740117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127779103.43859649, "logits/rejected": 75216867.15492958, "logps/chosen": -184.140350877193, "logps/rejected": -299.71830985915494, "loss": 0.1838, "rewards/chosen": 0.6129385964912281, "rewards/margins": 7.035473807758834, "rewards/rejected": -6.422535211267606, "step": 360 }, { "epoch": 0.2475145697634556, "grad_norm": 0.30737229579916836, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118731067.07692307, "logits/rejected": 149130808.8888889, "logps/chosen": -285.04615384615386, "logps/rejected": -371.3015873015873, "loss": 0.2033, "rewards/chosen": 1.3903846153846153, "rewards/margins": 6.993559218559218, "rewards/rejected": -5.603174603174603, "step": 361 }, { "epoch": 0.2482002056907782, "grad_norm": 0.2375487404317604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73941520.51612903, "logits/rejected": 103920236.60606061, "logps/chosen": -193.03225806451613, "logps/rejected": -334.7878787878788, "loss": 0.1799, "rewards/chosen": 1.4294354838709677, "rewards/margins": 6.618829423264907, "rewards/rejected": -5.1893939393939394, "step": 362 }, { "epoch": 0.2488858416181008, "grad_norm": 0.2121138222313415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149676734.17142856, "logits/rejected": 64803804.68965517, "logps/chosen": -243.42857142857142, "logps/rejected": -339.58620689655174, "loss": 0.2076, "rewards/chosen": 1.2446428571428572, "rewards/margins": 6.839470443349754, "rewards/rejected": -5.594827586206897, "step": 363 }, { "epoch": 0.2495714775454234, "grad_norm": 0.2542400821632652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147238850.86567163, "logits/rejected": 89429780.98360656, "logps/chosen": -230.92537313432837, "logps/rejected": -333.11475409836066, "loss": 0.1981, "rewards/chosen": 1.1819029850746268, "rewards/margins": 5.64091937851725, "rewards/rejected": -4.459016393442623, "step": 364 }, { "epoch": 0.250257113472746, "grad_norm": 0.22173484014070058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138352957.29577464, "logits/rejected": 94040710.73684211, "logps/chosen": -245.18309859154928, "logps/rejected": -334.5964912280702, "loss": 0.2142, "rewards/chosen": 1.1892605633802817, "rewards/margins": 6.013821966889054, "rewards/rejected": -4.824561403508772, "step": 365 }, { "epoch": 0.25094274940006855, "grad_norm": 0.19806488837384242, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128287849.93103448, "logits/rejected": 92813955.65714286, "logps/chosen": -251.0344827586207, "logps/rejected": -341.25714285714287, "loss": 0.1985, "rewards/chosen": 0.6729525862068966, "rewards/margins": 6.41580972906404, "rewards/rejected": -5.742857142857143, "step": 366 }, { "epoch": 0.2516283853273912, "grad_norm": 0.2978877815676007, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113926365.4054054, "logits/rejected": 155810626.37037036, "logps/chosen": -229.83783783783784, "logps/rejected": -423.7037037037037, "loss": 0.2303, "rewards/chosen": 0.9079391891891891, "rewards/margins": 5.421828078078079, "rewards/rejected": -4.513888888888889, "step": 367 }, { "epoch": 0.25231402125471375, "grad_norm": 0.20227482697783047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 177649069.41935483, "logits/rejected": 54081101.57575758, "logps/chosen": -265.80645161290323, "logps/rejected": -304.0, "loss": 0.1912, "rewards/chosen": 1.5257056451612903, "rewards/margins": 6.783281402737048, "rewards/rejected": -5.257575757575758, "step": 368 }, { "epoch": 0.2529996571820363, "grad_norm": 0.3823221866600182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148707141.8181818, "logits/rejected": 113516808.25806452, "logps/chosen": -232.0, "logps/rejected": -390.7096774193548, "loss": 0.2178, "rewards/chosen": 0.8882575757575758, "rewards/margins": 6.702773704789834, "rewards/rejected": -5.814516129032258, "step": 369 }, { "epoch": 0.25368529310935894, "grad_norm": 0.1997143919285234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116263163.50877193, "logits/rejected": 76058681.69014084, "logps/chosen": -183.01754385964912, "logps/rejected": -307.15492957746477, "loss": 0.1799, "rewards/chosen": 1.1217105263157894, "rewards/margins": 6.621710526315789, "rewards/rejected": -5.5, "step": 370 }, { "epoch": 0.2543709290366815, "grad_norm": 0.2233803044347286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117034611.61290322, "logits/rejected": 104412749.57575758, "logps/chosen": -226.32258064516128, "logps/rejected": -361.2121212121212, "loss": 0.1671, "rewards/chosen": 1.1149193548387097, "rewards/margins": 7.054313294232649, "rewards/rejected": -5.9393939393939394, "step": 371 }, { "epoch": 0.25505656496400414, "grad_norm": 0.31286593480573816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143335780.17391303, "logits/rejected": 136137155.2542373, "logps/chosen": -292.6376811594203, "logps/rejected": -414.3728813559322, "loss": 0.2099, "rewards/chosen": 1.1684782608695652, "rewards/margins": 7.397291820191599, "rewards/rejected": -6.228813559322034, "step": 372 }, { "epoch": 0.2557422008913267, "grad_norm": 0.18302014165873529, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162889238.92537314, "logits/rejected": 48062598.295081966, "logps/chosen": -286.32835820895525, "logps/rejected": -350.6885245901639, "loss": 0.2008, "rewards/chosen": 1.4412313432835822, "rewards/margins": 7.219919867873746, "rewards/rejected": -5.778688524590164, "step": 373 }, { "epoch": 0.2564278368186493, "grad_norm": 0.1714941529077624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 174448807.18367347, "logits/rejected": 77037153.21518987, "logps/chosen": -223.0204081632653, "logps/rejected": -335.39240506329116, "loss": 0.1786, "rewards/chosen": 0.9375, "rewards/margins": 6.557753164556962, "rewards/rejected": -5.620253164556962, "step": 374 }, { "epoch": 0.2571134727459719, "grad_norm": 0.19518004415800919, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127707166.56716418, "logits/rejected": 114827666.8852459, "logps/chosen": -225.43283582089552, "logps/rejected": -374.55737704918033, "loss": 0.2019, "rewards/chosen": 0.9981343283582089, "rewards/margins": 7.285019574259849, "rewards/rejected": -6.286885245901639, "step": 375 }, { "epoch": 0.2577991086732945, "grad_norm": 0.17612389921704447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81418842.35294117, "logits/rejected": 137293550.93333334, "logps/chosen": -220.47058823529412, "logps/rejected": -401.6, "loss": 0.2006, "rewards/chosen": 1.0147058823529411, "rewards/margins": 7.156372549019608, "rewards/rejected": -6.141666666666667, "step": 376 }, { "epoch": 0.25848474460061704, "grad_norm": 0.21023098546616206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152193316.57142857, "logits/rejected": 86918884.43076923, "logps/chosen": -262.6031746031746, "logps/rejected": -349.04615384615386, "loss": 0.2162, "rewards/chosen": 0.8948412698412699, "rewards/margins": 6.244841269841269, "rewards/rejected": -5.35, "step": 377 }, { "epoch": 0.25917038052793967, "grad_norm": 0.21794692132975044, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87653844.61016949, "logits/rejected": 146131982.84057972, "logps/chosen": -222.64406779661016, "logps/rejected": -438.2608695652174, "loss": 0.1744, "rewards/chosen": 0.8432203389830508, "rewards/margins": 7.0243797592729065, "rewards/rejected": -6.181159420289855, "step": 378 }, { "epoch": 0.25985601645526224, "grad_norm": 0.2063049873422324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161207162.4347826, "logits/rejected": 83388450.71186441, "logps/chosen": -314.4347826086956, "logps/rejected": -352.0, "loss": 0.2391, "rewards/chosen": 0.9438688858695652, "rewards/margins": 6.579462106208548, "rewards/rejected": -5.635593220338983, "step": 379 }, { "epoch": 0.26054165238258487, "grad_norm": 0.1508002531101751, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91599333.96610169, "logits/rejected": 128564535.6521739, "logps/chosen": -202.3050847457627, "logps/rejected": -317.2173913043478, "loss": 0.2065, "rewards/chosen": 0.608249470338983, "rewards/margins": 4.825640774686809, "rewards/rejected": -4.217391304347826, "step": 380 }, { "epoch": 0.26122728830990743, "grad_norm": 0.20302258962671776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103735180.61971831, "logits/rejected": 137013930.66666666, "logps/chosen": -230.30985915492957, "logps/rejected": -322.8070175438597, "loss": 0.1985, "rewards/chosen": 1.158450704225352, "rewards/margins": 7.33388930071658, "rewards/rejected": -6.175438596491228, "step": 381 }, { "epoch": 0.26191292423723, "grad_norm": 0.18313568068246466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 186356153.1076923, "logits/rejected": 56323510.85714286, "logps/chosen": -190.03076923076924, "logps/rejected": -340.8253968253968, "loss": 0.192, "rewards/chosen": 1.0653846153846154, "rewards/margins": 7.02967032967033, "rewards/rejected": -5.964285714285714, "step": 382 }, { "epoch": 0.26259856016455263, "grad_norm": 0.23443007480977737, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128192108.16901408, "logits/rejected": 76509255.85964912, "logps/chosen": -206.64788732394365, "logps/rejected": -341.89473684210526, "loss": 0.2018, "rewards/chosen": 1.4242957746478873, "rewards/margins": 5.025172967630343, "rewards/rejected": -3.6008771929824563, "step": 383 }, { "epoch": 0.2632841960918752, "grad_norm": 0.26744951060652666, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151682534.81967214, "logits/rejected": 39971091.104477614, "logps/chosen": -189.11475409836066, "logps/rejected": -305.1940298507463, "loss": 0.1733, "rewards/chosen": 1.1029713114754098, "rewards/margins": 7.005956386102276, "rewards/rejected": -5.902985074626866, "step": 384 }, { "epoch": 0.2639698320191978, "grad_norm": 0.2100303291051179, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80865182.47619048, "logits/rejected": 152285499.07692307, "logps/chosen": -196.57142857142858, "logps/rejected": -324.9230769230769, "loss": 0.1999, "rewards/chosen": 0.6378968253968254, "rewards/margins": 7.260973748473748, "rewards/rejected": -6.623076923076923, "step": 385 }, { "epoch": 0.2646554679465204, "grad_norm": 0.19621392443990887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143305386.66666666, "logits/rejected": 94433520.94117647, "logps/chosen": -248.93333333333334, "logps/rejected": -368.47058823529414, "loss": 0.1719, "rewards/chosen": 1.3697916666666667, "rewards/margins": 7.891850490196079, "rewards/rejected": -6.522058823529412, "step": 386 }, { "epoch": 0.26534110387384297, "grad_norm": 0.17889899163298942, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 210994818.16949153, "logits/rejected": 79934924.05797102, "logps/chosen": -266.03389830508473, "logps/rejected": -343.18840579710144, "loss": 0.191, "rewards/chosen": 1.0921610169491525, "rewards/margins": 7.026943625644805, "rewards/rejected": -5.934782608695652, "step": 387 }, { "epoch": 0.2660267398011656, "grad_norm": 0.24750222075294268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150624858.3529412, "logits/rejected": 96399086.93333334, "logps/chosen": -293.6470588235294, "logps/rejected": -382.93333333333334, "loss": 0.1909, "rewards/chosen": 1.7463235294117647, "rewards/margins": -14453057.45367647, "rewards/rejected": 14453059.2, "step": 388 }, { "epoch": 0.26671237572848816, "grad_norm": 0.23309823824173742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119892566.64615385, "logits/rejected": 147666131.3015873, "logps/chosen": -240.4923076923077, "logps/rejected": -411.93650793650795, "loss": 0.1941, "rewards/chosen": 1.0711538461538461, "rewards/margins": 7.817185592185592, "rewards/rejected": -6.746031746031746, "step": 389 }, { "epoch": 0.2673980116558108, "grad_norm": 0.2596102861587467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110250276.57142857, "logits/rejected": 116279012.43076923, "logps/chosen": -229.5873015873016, "logps/rejected": -370.2153846153846, "loss": 0.1829, "rewards/chosen": 0.9325396825396826, "rewards/margins": 7.463308913308913, "rewards/rejected": -6.530769230769231, "step": 390 }, { "epoch": 0.26808364758313336, "grad_norm": 0.22406479943057286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121189965.57575758, "logits/rejected": 104384049.5483871, "logps/chosen": -241.93939393939394, "logps/rejected": -338.5806451612903, "loss": 0.1903, "rewards/chosen": 1.2708333333333333, "rewards/margins": 6.706317204301075, "rewards/rejected": -5.435483870967742, "step": 391 }, { "epoch": 0.2687692835104559, "grad_norm": 0.21185950175743323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135083219.3015873, "logits/rejected": 126538925.29230769, "logps/chosen": -315.42857142857144, "logps/rejected": -383.5076923076923, "loss": 0.2104, "rewards/chosen": 0.8670634920634921, "rewards/margins": 7.197832722832723, "rewards/rejected": -6.3307692307692305, "step": 392 }, { "epoch": 0.26945491943777855, "grad_norm": 0.2293512658832887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121812040.11267605, "logits/rejected": 140693144.7017544, "logps/chosen": -226.25352112676057, "logps/rejected": -444.0701754385965, "loss": 0.208, "rewards/chosen": 0.9929577464788732, "rewards/margins": 6.747343711391154, "rewards/rejected": -5.754385964912281, "step": 393 }, { "epoch": 0.2701405553651011, "grad_norm": 0.26167091364979805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148616085.01492536, "logits/rejected": 121909852.32786885, "logps/chosen": -266.9850746268657, "logps/rejected": -346.75409836065575, "loss": 0.2044, "rewards/chosen": 0.6100746268656716, "rewards/margins": 6.3354844629312455, "rewards/rejected": -5.725409836065574, "step": 394 }, { "epoch": 0.27082619129242375, "grad_norm": 0.20760281253096372, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104577979.73333333, "logits/rejected": 124842224.94117647, "logps/chosen": -214.13333333333333, "logps/rejected": -351.05882352941177, "loss": 0.209, "rewards/chosen": 0.6557291666666667, "rewards/margins": 6.751317401960785, "rewards/rejected": -6.095588235294118, "step": 395 }, { "epoch": 0.2715118272197463, "grad_norm": 0.2247940651098544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128205892.26666667, "logits/rejected": 81973970.8235294, "logps/chosen": -210.66666666666666, "logps/rejected": -311.7647058823529, "loss": 0.1985, "rewards/chosen": 0.9416666666666667, "rewards/margins": 6.684313725490195, "rewards/rejected": -5.742647058823529, "step": 396 }, { "epoch": 0.2721974631470689, "grad_norm": 0.18683007548264444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83691899.25925925, "logits/rejected": 126282558.27027027, "logps/chosen": -229.92592592592592, "logps/rejected": -312.64864864864865, "loss": 0.1786, "rewards/chosen": 1.1024305555555556, "rewards/margins": 6.730808933933933, "rewards/rejected": -5.628378378378378, "step": 397 }, { "epoch": 0.2728830990743915, "grad_norm": 0.33720847265790477, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128049633.88235295, "logits/rejected": 99125384.53333333, "logps/chosen": -278.3529411764706, "logps/rejected": -338.6666666666667, "loss": 0.2053, "rewards/chosen": 1.3363970588235294, "rewards/margins": 5.7280637254901965, "rewards/rejected": -4.391666666666667, "step": 398 }, { "epoch": 0.2735687350017141, "grad_norm": 0.2945344838954198, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135983750.7368421, "logits/rejected": 57597836.61971831, "logps/chosen": -217.5438596491228, "logps/rejected": -367.32394366197184, "loss": 0.1991, "rewards/chosen": 0.7834429824561403, "rewards/margins": 7.346823264146281, "rewards/rejected": -6.563380281690141, "step": 399 }, { "epoch": 0.2742543709290367, "grad_norm": 0.17517010790950638, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122391165.90163934, "logits/rejected": 109051904.0, "logps/chosen": -235.14754098360655, "logps/rejected": -333.85074626865674, "loss": 0.1916, "rewards/chosen": 1.3017418032786885, "rewards/margins": 7.734577624174211, "rewards/rejected": -6.432835820895522, "step": 400 }, { "epoch": 0.2749400068563593, "grad_norm": 0.178900831927598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102286897.5483871, "logits/rejected": 160400352.96969697, "logps/chosen": -223.2258064516129, "logps/rejected": -435.8787878787879, "loss": 0.1769, "rewards/chosen": 1.3558467741935485, "rewards/margins": 8.174028592375366, "rewards/rejected": -6.818181818181818, "step": 401 }, { "epoch": 0.27562564278368185, "grad_norm": 0.19664061166804594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123850674.71698113, "logits/rejected": 117664208.21333334, "logps/chosen": -174.64150943396226, "logps/rejected": -371.62666666666667, "loss": 0.1376, "rewards/chosen": 1.3714622641509433, "rewards/margins": 7.778128930817609, "rewards/rejected": -6.406666666666666, "step": 402 }, { "epoch": 0.27631127871100447, "grad_norm": 0.2255404317969975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126280814.27692308, "logits/rejected": 50198495.49206349, "logps/chosen": -222.4, "logps/rejected": -293.07936507936506, "loss": 0.2133, "rewards/chosen": 1.0182692307692307, "rewards/margins": 7.200808913308913, "rewards/rejected": -6.182539682539683, "step": 403 }, { "epoch": 0.27699691463832704, "grad_norm": 0.16475443647528282, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104146701.01694915, "logits/rejected": 105039961.04347827, "logps/chosen": -199.59322033898306, "logps/rejected": -342.72463768115944, "loss": 0.1899, "rewards/chosen": 1.1133474576271187, "rewards/margins": 7.533637312699582, "rewards/rejected": -6.420289855072464, "step": 404 }, { "epoch": 0.27768255056564967, "grad_norm": 0.24316870489791378, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148627191.7419355, "logits/rejected": 96087691.63636364, "logps/chosen": -245.41935483870967, "logps/rejected": -364.8484848484849, "loss": 0.2067, "rewards/chosen": 0.7752016129032258, "rewards/margins": 6.222171309872923, "rewards/rejected": -5.446969696969697, "step": 405 }, { "epoch": 0.27836818649297224, "grad_norm": 0.2595441607023108, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138205754.75409836, "logits/rejected": 76201739.46268657, "logps/chosen": -234.75409836065575, "logps/rejected": -365.97014925373134, "loss": 0.2004, "rewards/chosen": 0.9948770491803278, "rewards/margins": 6.274727795448984, "rewards/rejected": -5.279850746268656, "step": 406 }, { "epoch": 0.2790538224202948, "grad_norm": 0.20218800323512165, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105467680.58181818, "logits/rejected": 110430853.26027398, "logps/chosen": -185.74545454545455, "logps/rejected": -429.5890410958904, "loss": 0.1713, "rewards/chosen": 1.1181818181818182, "rewards/margins": 6.830510585305106, "rewards/rejected": -5.712328767123288, "step": 407 }, { "epoch": 0.27973945834761743, "grad_norm": 0.21301732775496565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99310294.70967741, "logits/rejected": 78627312.48484848, "logps/chosen": -193.80645161290323, "logps/rejected": -332.6060606060606, "loss": 0.1941, "rewards/chosen": 1.0685483870967742, "rewards/margins": -23863688.628421307, "rewards/rejected": 23863689.696969695, "step": 408 }, { "epoch": 0.28042509427494, "grad_norm": 0.1642019684512107, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100434515.78181818, "logits/rejected": 127265525.47945206, "logps/chosen": -189.6727272727273, "logps/rejected": -342.7945205479452, "loss": 0.1792, "rewards/chosen": 0.9926136363636363, "rewards/margins": 6.896723225404732, "rewards/rejected": -5.904109589041096, "step": 409 }, { "epoch": 0.28111073020226257, "grad_norm": 0.32665453037437575, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82903040.0, "logits/rejected": 105709568.0, "logps/chosen": -202.5, "logps/rejected": -276.5, "loss": 0.1826, "rewards/chosen": 1.1005859375, "rewards/margins": 6.0146484375, "rewards/rejected": -4.9140625, "step": 410 }, { "epoch": 0.2817963661295852, "grad_norm": 0.2837622959797491, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149596842.66666666, "logits/rejected": 62915002.81081081, "logps/chosen": -219.7037037037037, "logps/rejected": -316.1081081081081, "loss": 0.1841, "rewards/chosen": 1.0914351851851851, "rewards/margins": 6.66575950950951, "rewards/rejected": -5.574324324324325, "step": 411 }, { "epoch": 0.28248200205690777, "grad_norm": 0.26474716348733685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150220610.95384616, "logits/rejected": 107320921.3968254, "logps/chosen": -305.2307692307692, "logps/rejected": -357.58730158730157, "loss": 0.2056, "rewards/chosen": 0.9471153846153846, "rewards/margins": 7.4947344322344325, "rewards/rejected": -6.5476190476190474, "step": 412 }, { "epoch": 0.2831676379842304, "grad_norm": 0.4850227891731547, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125118794.32258065, "logits/rejected": 80772127.03030303, "logps/chosen": -219.74193548387098, "logps/rejected": -368.0, "loss": 0.2137, "rewards/chosen": 1.0272177419354838, "rewards/margins": 6.193884408602151, "rewards/rejected": -5.166666666666667, "step": 413 }, { "epoch": 0.28385327391155296, "grad_norm": 0.24078799993984037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148741288.11940297, "logits/rejected": 67822239.47540984, "logps/chosen": -238.2089552238806, "logps/rejected": -368.78688524590166, "loss": 0.1998, "rewards/chosen": 1.2835820895522387, "rewards/margins": 7.308172253486664, "rewards/rejected": -6.024590163934426, "step": 414 }, { "epoch": 0.28453890983887553, "grad_norm": 0.25490312340723986, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98633794.06451613, "logits/rejected": 91321437.0909091, "logps/chosen": -191.74193548387098, "logps/rejected": -364.3636363636364, "loss": 0.1898, "rewards/chosen": 1.0443548387096775, "rewards/margins": 7.521627565982405, "rewards/rejected": -6.4772727272727275, "step": 415 }, { "epoch": 0.28522454576619816, "grad_norm": 0.31178703651675227, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109655629.57575758, "logits/rejected": 136788430.4516129, "logps/chosen": -244.12121212121212, "logps/rejected": -392.7741935483871, "loss": 0.1872, "rewards/chosen": 1.178030303030303, "rewards/margins": 7.040933528836755, "rewards/rejected": -5.862903225806452, "step": 416 }, { "epoch": 0.2859101816935207, "grad_norm": 0.24065998985133316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125109202.14925373, "logits/rejected": 117096716.59016393, "logps/chosen": -195.5820895522388, "logps/rejected": -349.37704918032784, "loss": 0.1985, "rewards/chosen": 0.8987873134328358, "rewards/margins": 7.48075452654759, "rewards/rejected": -6.581967213114754, "step": 417 }, { "epoch": 0.28659581762084335, "grad_norm": 0.2264993746152496, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165611457.93939394, "logits/rejected": 92308513.03225806, "logps/chosen": -270.7878787878788, "logps/rejected": -420.64516129032256, "loss": 0.1905, "rewards/chosen": 1.1515151515151516, "rewards/margins": 8.167644183773216, "rewards/rejected": -7.016129032258065, "step": 418 }, { "epoch": 0.2872814535481659, "grad_norm": 0.2921004310347814, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104620824.77419356, "logits/rejected": 104857600.0, "logps/chosen": -184.25806451612902, "logps/rejected": -343.75757575757575, "loss": 0.1936, "rewards/chosen": 0.6985887096774194, "rewards/margins": 4.853891739980449, "rewards/rejected": -4.15530303030303, "step": 419 }, { "epoch": 0.2879670894754885, "grad_norm": 0.2930575337539008, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81089877.33333333, "logits/rejected": 108375403.35483871, "logps/chosen": -226.9090909090909, "logps/rejected": -366.4516129032258, "loss": 0.1918, "rewards/chosen": 1.1799242424242424, "rewards/margins": 7.534762952101661, "rewards/rejected": -6.354838709677419, "step": 420 }, { "epoch": 0.2886527254028111, "grad_norm": 0.24313501139514912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130772406.85714285, "logits/rejected": 175228700.44444445, "logps/chosen": -265.42857142857144, "logps/rejected": -441.77777777777777, "loss": 0.1746, "rewards/chosen": 0.6037946428571429, "rewards/margins": 6.569072420634921, "rewards/rejected": -5.965277777777778, "step": 421 }, { "epoch": 0.2893383613301337, "grad_norm": 0.19082066734784384, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 196164371.69230768, "logits/rejected": 62148933.07936508, "logps/chosen": -259.9384615384615, "logps/rejected": -322.2857142857143, "loss": 0.1913, "rewards/chosen": 0.49230769230769234, "rewards/margins": 5.460561660561661, "rewards/rejected": -4.968253968253968, "step": 422 }, { "epoch": 0.2900239972574563, "grad_norm": 0.20511148168025534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114479826.8235294, "logits/rejected": 91855257.6, "logps/chosen": -213.41176470588235, "logps/rejected": -329.06666666666666, "loss": 0.2031, "rewards/chosen": 1.072610294117647, "rewards/margins": 6.96844362745098, "rewards/rejected": -5.895833333333333, "step": 423 }, { "epoch": 0.2907096331847789, "grad_norm": 0.1793203475424562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125369470.24657534, "logits/rejected": 127125541.23636363, "logps/chosen": -250.3013698630137, "logps/rejected": -356.07272727272726, "loss": 0.2137, "rewards/chosen": 1.2268835616438356, "rewards/margins": 7.5541562889165625, "rewards/rejected": -6.327272727272727, "step": 424 }, { "epoch": 0.29139526911210145, "grad_norm": 0.24563043960219041, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144826849.88235295, "logits/rejected": 92973738.66666667, "logps/chosen": -316.70588235294116, "logps/rejected": -337.3333333333333, "loss": 0.2053, "rewards/chosen": 1.2201286764705883, "rewards/margins": 7.345128676470589, "rewards/rejected": -6.125, "step": 425 }, { "epoch": 0.2920809050394241, "grad_norm": 0.3026429630469364, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116129792.0, "logits/rejected": 72548352.0, "logps/chosen": -219.0, "logps/rejected": -332.0, "loss": 0.2204, "rewards/chosen": 1.05810546875, "rewards/margins": 6.86279296875, "rewards/rejected": -5.8046875, "step": 426 }, { "epoch": 0.29276654096674665, "grad_norm": 0.2160594207837742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 217595407.5151515, "logits/rejected": 46610894.451612905, "logps/chosen": -279.75757575757575, "logps/rejected": -354.3225806451613, "loss": 0.2055, "rewards/chosen": 1.152462121212121, "rewards/margins": 14595837.023429863, "rewards/rejected": -14595835.870967742, "step": 427 }, { "epoch": 0.29345217689406927, "grad_norm": 0.18485991999390228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183500800.0, "logits/rejected": 51292160.0, "logps/chosen": -262.0, "logps/rejected": -313.0, "loss": 0.2003, "rewards/chosen": 0.84130859375, "rewards/margins": 7.00537109375, "rewards/rejected": -6.1640625, "step": 428 }, { "epoch": 0.29413781282139184, "grad_norm": 0.1900468087542673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89097184.96969697, "logits/rejected": 109863704.77419356, "logps/chosen": -189.45454545454547, "logps/rejected": -375.741935483871, "loss": 0.1924, "rewards/chosen": 0.9706439393939394, "rewards/margins": 6.9787084555229715, "rewards/rejected": -6.008064516129032, "step": 429 }, { "epoch": 0.2948234487487144, "grad_norm": 0.21895355257634344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124504602.94736843, "logits/rejected": 88725661.53846154, "logps/chosen": -277.4736842105263, "logps/rejected": -303.38461538461536, "loss": 0.2647, "rewards/chosen": 0.6792763157894737, "rewards/margins": -1973622.3976467613, "rewards/rejected": 1973623.076923077, "step": 430 }, { "epoch": 0.29550908467603704, "grad_norm": 0.2666952661868896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114411292.44444445, "logits/rejected": 64075483.428571425, "logps/chosen": -226.66666666666666, "logps/rejected": -329.42857142857144, "loss": 0.203, "rewards/chosen": 1.4192708333333333, "rewards/margins": 6.843377976190476, "rewards/rejected": -5.424107142857143, "step": 431 }, { "epoch": 0.2961947206033596, "grad_norm": 0.17954494635293952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102341017.6, "logits/rejected": 129300268.13793103, "logps/chosen": -188.8, "logps/rejected": -392.82758620689657, "loss": 0.2274, "rewards/chosen": 0.6988839285714286, "rewards/margins": 6.9144011699507395, "rewards/rejected": -6.2155172413793105, "step": 432 }, { "epoch": 0.29688035653068223, "grad_norm": 0.3010052759544202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111757906.58064516, "logits/rejected": 127354321.45454545, "logps/chosen": -268.1290322580645, "logps/rejected": -388.3636363636364, "loss": 0.1938, "rewards/chosen": 1.094758064516129, "rewards/margins": 7.299303519061583, "rewards/rejected": -6.204545454545454, "step": 433 }, { "epoch": 0.2975659924580048, "grad_norm": 0.23657251095082155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175073355.85185185, "logits/rejected": 68653388.1081081, "logps/chosen": -266.3703703703704, "logps/rejected": -342.4864864864865, "loss": 0.1738, "rewards/chosen": 0.8888888888888888, "rewards/margins": 7.4024024024024015, "rewards/rejected": -6.513513513513513, "step": 434 }, { "epoch": 0.29825162838532737, "grad_norm": 0.22222346658211573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117768192.0, "logits/rejected": 134916778.66666666, "logps/chosen": -230.0, "logps/rejected": -391.55555555555554, "loss": 0.1831, "rewards/chosen": 1.2756696428571428, "rewards/margins": 5.831225198412698, "rewards/rejected": -4.555555555555555, "step": 435 }, { "epoch": 0.29893726431265, "grad_norm": 0.23598292573062035, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156396699.15151516, "logits/rejected": 32573506.06451613, "logps/chosen": -228.24242424242425, "logps/rejected": -298.06451612903226, "loss": 0.1889, "rewards/chosen": 1.2613636363636365, "rewards/margins": 7.104105571847508, "rewards/rejected": -5.842741935483871, "step": 436 }, { "epoch": 0.29962290023997257, "grad_norm": 0.3074100720269893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117558661.40845071, "logits/rejected": 111700938.10526316, "logps/chosen": -199.2112676056338, "logps/rejected": -389.3333333333333, "loss": 0.1945, "rewards/chosen": 0.9630281690140845, "rewards/margins": -26258.493112181863, "rewards/rejected": 26259.456140350878, "step": 437 }, { "epoch": 0.3003085361672952, "grad_norm": 0.23243486748907513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 194336085.33333334, "logits/rejected": 68800115.61290322, "logps/chosen": -290.6666666666667, "logps/rejected": -362.3225806451613, "loss": 0.1783, "rewards/chosen": 1.2859848484848484, "rewards/margins": 7.334371945259043, "rewards/rejected": -6.048387096774194, "step": 438 }, { "epoch": 0.30099417209461776, "grad_norm": 0.2551972897928112, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124660215.60655738, "logits/rejected": 89864528.23880596, "logps/chosen": -216.52459016393442, "logps/rejected": -366.32835820895525, "loss": 0.1627, "rewards/chosen": 1.1280737704918034, "rewards/margins": 7.5161334719843405, "rewards/rejected": -6.388059701492537, "step": 439 }, { "epoch": 0.30167980802194033, "grad_norm": 0.28199833223952686, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138561828.57142857, "logits/rejected": 69322524.44444445, "logps/chosen": -271.7142857142857, "logps/rejected": -328.0, "loss": 0.1869, "rewards/chosen": 0.85546875, "rewards/margins": 4.098524305555555, "rewards/rejected": -3.2430555555555554, "step": 440 }, { "epoch": 0.30236544394926296, "grad_norm": 0.23468289871311335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120658060.2739726, "logits/rejected": 113474988.21818182, "logps/chosen": -232.76712328767124, "logps/rejected": -349.6727272727273, "loss": 0.2137, "rewards/chosen": 1.1087328767123288, "rewards/margins": 7.545096513075965, "rewards/rejected": -6.4363636363636365, "step": 441 }, { "epoch": 0.3030510798765855, "grad_norm": 0.2094714003320075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143983570.14925373, "logits/rejected": 57293505.04918033, "logps/chosen": -262.44776119402985, "logps/rejected": -375.8688524590164, "loss": 0.1937, "rewards/chosen": 1.0690298507462686, "rewards/margins": 7.855915096647908, "rewards/rejected": -6.786885245901639, "step": 442 }, { "epoch": 0.3037367158039081, "grad_norm": 0.8026515836973144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112047835.42857143, "logits/rejected": 156346297.37931034, "logps/chosen": -300.57142857142856, "logps/rejected": -357.7931034482759, "loss": 0.1813, "rewards/chosen": 1.6482142857142856, "rewards/margins": 7.669766009852216, "rewards/rejected": -6.021551724137931, "step": 443 }, { "epoch": 0.3044223517312307, "grad_norm": 0.6947587292593924, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89128960.0, "logits/rejected": 127270912.0, "logps/chosen": -213.0, "logps/rejected": -368.5, "loss": 0.1685, "rewards/chosen": 1.26953125, "rewards/margins": 7.44921875, "rewards/rejected": -6.1796875, "step": 444 }, { "epoch": 0.3051079876585533, "grad_norm": 0.18873586285133898, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119347013.81818181, "logits/rejected": 106616501.67741935, "logps/chosen": -231.5151515151515, "logps/rejected": -296.258064516129, "loss": 0.2143, "rewards/chosen": 0.9583333333333334, "rewards/margins": 7.071236559139785, "rewards/rejected": -6.112903225806452, "step": 445 }, { "epoch": 0.3057936235858759, "grad_norm": 0.24970210840753504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89960589.2413793, "logits/rejected": 132540006.4, "logps/chosen": -198.89655172413794, "logps/rejected": -394.0571428571429, "loss": 0.1793, "rewards/chosen": 0.978448275862069, "rewards/margins": 5.742733990147784, "rewards/rejected": -4.764285714285714, "step": 446 }, { "epoch": 0.3064792595131985, "grad_norm": 0.17174680593997949, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129710416.23880596, "logits/rejected": 145838012.852459, "logps/chosen": -231.6417910447761, "logps/rejected": -339.9344262295082, "loss": 0.2092, "rewards/chosen": 0.5923507462686567, "rewards/margins": 6.551367139711279, "rewards/rejected": -5.959016393442623, "step": 447 }, { "epoch": 0.30716489544052106, "grad_norm": 0.2071176983751381, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152507643.80327868, "logits/rejected": 115186856.11940299, "logps/chosen": -201.18032786885246, "logps/rejected": -353.910447761194, "loss": 0.1809, "rewards/chosen": 1.1065573770491803, "rewards/margins": 6.912527526302911, "rewards/rejected": -5.8059701492537314, "step": 448 }, { "epoch": 0.3078505313678437, "grad_norm": 0.19468742032551442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145529638.78787878, "logits/rejected": 110946105.80645162, "logps/chosen": -204.6060606060606, "logps/rejected": -364.38709677419354, "loss": 0.1953, "rewards/chosen": 1.0009469696969697, "rewards/margins": 6.799334066471164, "rewards/rejected": -5.798387096774194, "step": 449 }, { "epoch": 0.30853616729516625, "grad_norm": 0.1852041452234642, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131997214.11764705, "logits/rejected": 69468160.0, "logps/chosen": -205.1764705882353, "logps/rejected": -326.1333333333333, "loss": 0.2127, "rewards/chosen": 0.9319852941176471, "rewards/margins": 5.61531862745098, "rewards/rejected": -4.683333333333334, "step": 450 }, { "epoch": 0.3092218032224889, "grad_norm": 0.2821527262893935, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117621004.59016393, "logits/rejected": 98096632.35820895, "logps/chosen": -210.2295081967213, "logps/rejected": -325.25373134328356, "loss": 0.1746, "rewards/chosen": 1.3186475409836065, "rewards/margins": 7.363423660386592, "rewards/rejected": -6.044776119402985, "step": 451 }, { "epoch": 0.30990743914981145, "grad_norm": 0.31524120303701486, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145365746.52631578, "logits/rejected": 129297486.76923077, "logps/chosen": -266.42105263157896, "logps/rejected": -378.15384615384613, "loss": 0.2245, "rewards/chosen": 1.0888157894736843, "rewards/margins": 6.800354251012146, "rewards/rejected": -5.711538461538462, "step": 452 }, { "epoch": 0.310593075077134, "grad_norm": 0.23523392719489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118334380.06557377, "logits/rejected": 114247832.8358209, "logps/chosen": -244.19672131147541, "logps/rejected": -365.85074626865674, "loss": 0.1863, "rewards/chosen": 1.0573770491803278, "rewards/margins": 6.90066063126988, "rewards/rejected": -5.843283582089552, "step": 453 }, { "epoch": 0.31127871100445664, "grad_norm": 0.2071857391039069, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131167325.0909091, "logits/rejected": 99919145.29032259, "logps/chosen": -267.1515151515151, "logps/rejected": -313.2903225806452, "loss": 0.1912, "rewards/chosen": 1.0994318181818181, "rewards/margins": 8.147818914956012, "rewards/rejected": -7.048387096774194, "step": 454 }, { "epoch": 0.3119643469317792, "grad_norm": 0.2020291157174289, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129433600.0, "logits/rejected": 89186304.0, "logps/chosen": -241.25, "logps/rejected": -352.0, "loss": 0.2079, "rewards/chosen": 0.7890625, "rewards/margins": 6.8125, "rewards/rejected": -6.0234375, "step": 455 }, { "epoch": 0.31264998285910184, "grad_norm": 0.17827335313330947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133623013.25373134, "logits/rejected": 123903865.70491803, "logps/chosen": -267.94029850746267, "logps/rejected": -358.2950819672131, "loss": 0.2015, "rewards/chosen": 1.0354477611940298, "rewards/margins": 7.73216907266944, "rewards/rejected": -6.69672131147541, "step": 456 }, { "epoch": 0.3133356187864244, "grad_norm": 0.2147785026269578, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118139562.66666667, "logits/rejected": 112752760.47058824, "logps/chosen": -241.33333333333334, "logps/rejected": -344.70588235294116, "loss": 0.1881, "rewards/chosen": 0.8440104166666667, "rewards/margins": 7.270481004901961, "rewards/rejected": -6.426470588235294, "step": 457 }, { "epoch": 0.314021254713747, "grad_norm": 0.17715789419310005, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154210577.06666666, "logits/rejected": 57424956.23529412, "logps/chosen": -302.93333333333334, "logps/rejected": -329.1764705882353, "loss": 0.1607, "rewards/chosen": 1.6338541666666666, "rewards/margins": 7.575030637254902, "rewards/rejected": -5.9411764705882355, "step": 458 }, { "epoch": 0.3147068906410696, "grad_norm": 0.20434634163504767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105507421.74647887, "logits/rejected": 127153637.05263157, "logps/chosen": -231.43661971830986, "logps/rejected": -329.82456140350877, "loss": 0.2125, "rewards/chosen": 1.0070422535211268, "rewards/margins": 7.349147516679022, "rewards/rejected": -6.342105263157895, "step": 459 }, { "epoch": 0.3153925265683922, "grad_norm": 0.21088349367416248, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128974848.0, "logits/rejected": 130285568.0, "logps/chosen": -252.875, "logps/rejected": -403.0, "loss": 0.1888, "rewards/chosen": 1.21484375, "rewards/margins": 7.38671875, "rewards/rejected": -6.171875, "step": 460 }, { "epoch": 0.3160781624957148, "grad_norm": 0.19998754920799794, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154723214.2222222, "logits/rejected": 112112612.32432432, "logps/chosen": -220.44444444444446, "logps/rejected": -398.27027027027026, "loss": 0.1954, "rewards/chosen": 0.7722077546296297, "rewards/margins": 7.06274829517017, "rewards/rejected": -6.29054054054054, "step": 461 }, { "epoch": 0.31676379842303737, "grad_norm": 0.17837911461903522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 187392971.93220338, "logits/rejected": 38789713.623188406, "logps/chosen": -296.6779661016949, "logps/rejected": -335.536231884058, "loss": 0.1847, "rewards/chosen": 1.2076271186440677, "rewards/margins": 7.00472856791943, "rewards/rejected": -5.797101449275362, "step": 462 }, { "epoch": 0.31744943435035994, "grad_norm": 0.17021286930231114, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 65896448.0, "logits/rejected": 170917888.0, "logps/chosen": -148.25, "logps/rejected": -386.25, "loss": 0.1744, "rewards/chosen": 1.22119140625, "rewards/margins": 8.04931640625, "rewards/rejected": -6.828125, "step": 463 }, { "epoch": 0.31813507027768256, "grad_norm": 0.18618748524555667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125015299.82089552, "logits/rejected": 109326940.32786885, "logps/chosen": -241.43283582089552, "logps/rejected": -353.04918032786884, "loss": 0.1953, "rewards/chosen": 1.25, "rewards/margins": 7.979508196721311, "rewards/rejected": -6.729508196721311, "step": 464 }, { "epoch": 0.31882070620500513, "grad_norm": 0.16685041841742793, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105060550.19354838, "logits/rejected": 115470460.12121212, "logps/chosen": -212.1290322580645, "logps/rejected": -326.3030303030303, "loss": 0.1862, "rewards/chosen": 1.0735887096774193, "rewards/margins": 7.376619012707722, "rewards/rejected": -6.303030303030303, "step": 465 }, { "epoch": 0.31950634213232776, "grad_norm": 0.2649699331291284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156465775.3043478, "logits/rejected": 102511633.3559322, "logps/chosen": -311.8840579710145, "logps/rejected": -314.03389830508473, "loss": 0.2135, "rewards/chosen": 0.8858695652173914, "rewards/margins": 7.521462785556374, "rewards/rejected": -6.635593220338983, "step": 466 }, { "epoch": 0.32019197805965033, "grad_norm": 0.1941377222025352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135637338.58461538, "logits/rejected": 67308592.76190476, "logps/chosen": -281.6, "logps/rejected": -379.93650793650795, "loss": 0.1961, "rewards/chosen": 1.3442307692307693, "rewards/margins": 7.836294261294261, "rewards/rejected": -6.492063492063492, "step": 467 }, { "epoch": 0.3208776139869729, "grad_norm": 0.18787441889087111, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 181017330.52631578, "logits/rejected": 22020096.0, "logps/chosen": -279.0175438596491, "logps/rejected": -317.2957746478873, "loss": 0.1847, "rewards/chosen": 1.1282894736842106, "rewards/margins": 6.388852853965901, "rewards/rejected": -5.26056338028169, "step": 468 }, { "epoch": 0.3215632499142955, "grad_norm": 0.25233439998636553, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105709568.0, "logits/rejected": 76283904.0, "logps/chosen": -162.375, "logps/rejected": -354.25, "loss": 0.1754, "rewards/chosen": 1.3916015625, "rewards/margins": 7.7666015625, "rewards/rejected": -6.375, "step": 469 }, { "epoch": 0.3222488858416181, "grad_norm": 0.209849822616892, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163255217.23076922, "logits/rejected": 94738009.3968254, "logps/chosen": -241.47692307692307, "logps/rejected": -383.4920634920635, "loss": 0.1839, "rewards/chosen": 1.1923076923076923, "rewards/margins": 8.224053724053723, "rewards/rejected": -7.031746031746032, "step": 470 }, { "epoch": 0.3229345217689407, "grad_norm": 0.1951013032858324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152232607.47540984, "logits/rejected": 98785249.43283582, "logps/chosen": -305.04918032786884, "logps/rejected": -335.76119402985074, "loss": 0.1758, "rewards/chosen": 1.6577868852459017, "rewards/margins": 7.433906288230975, "rewards/rejected": -5.776119402985074, "step": 471 }, { "epoch": 0.3236201576962633, "grad_norm": 0.21125076233637527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115403278.62857144, "logits/rejected": 114837150.89655173, "logps/chosen": -239.0857142857143, "logps/rejected": -318.3448275862069, "loss": 0.2284, "rewards/chosen": 0.5861607142857143, "rewards/margins": 6.879264162561577, "rewards/rejected": -6.293103448275862, "step": 472 }, { "epoch": 0.32430579362358586, "grad_norm": 0.19967048137428953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118194176.0, "logits/rejected": 127860736.0, "logps/chosen": -260.75, "logps/rejected": -430.5, "loss": 0.2188, "rewards/chosen": 0.6337890625, "rewards/margins": 5.3525390625, "rewards/rejected": -4.71875, "step": 473 }, { "epoch": 0.3249914295509085, "grad_norm": 0.2082885483049421, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 23787695.542857144, "logits/rejected": 166615110.62068966, "logps/chosen": -181.71428571428572, "logps/rejected": -337.6551724137931, "loss": 0.1855, "rewards/chosen": 1.3678571428571429, "rewards/margins": 7.329064039408867, "rewards/rejected": -5.961206896551724, "step": 474 }, { "epoch": 0.32567706547823105, "grad_norm": 0.25989172180340847, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98566144.0, "logits/rejected": 146366746.4827586, "logps/chosen": -270.4, "logps/rejected": -337.1034482758621, "loss": 0.2282, "rewards/chosen": 0.5839285714285715, "rewards/margins": -7294995.278140394, "rewards/rejected": 7294995.862068965, "step": 475 }, { "epoch": 0.3263627014055536, "grad_norm": 0.23049159363226615, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88625098.80519481, "logits/rejected": 90424259.76470588, "logps/chosen": -214.44155844155844, "logps/rejected": -298.6666666666667, "loss": 0.2488, "rewards/chosen": 0.9620789366883117, "rewards/margins": 6.516000505315763, "rewards/rejected": -5.553921568627451, "step": 476 }, { "epoch": 0.32704833733287625, "grad_norm": 0.18772711835789896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 228116017.5483871, "logits/rejected": 30504029.09090909, "logps/chosen": -288.0, "logps/rejected": -290.1818181818182, "loss": 0.1893, "rewards/chosen": 1.1350806451612903, "rewards/margins": 7.88508064516129, "rewards/rejected": -6.75, "step": 477 }, { "epoch": 0.3277339732601988, "grad_norm": 0.20759208954949593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143700502.26086956, "logits/rejected": 92665682.44067797, "logps/chosen": -269.4492753623188, "logps/rejected": -366.3728813559322, "loss": 0.2153, "rewards/chosen": 1.3858695652173914, "rewards/margins": 6.5977339719970525, "rewards/rejected": -5.211864406779661, "step": 478 }, { "epoch": 0.32841960918752144, "grad_norm": 0.1677455460711525, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120011214.4516129, "logits/rejected": 132692526.54545455, "logps/chosen": -240.7741935483871, "logps/rejected": -446.54545454545456, "loss": 0.1873, "rewards/chosen": 1.1461693548387097, "rewards/margins": 8.191623900293255, "rewards/rejected": -7.045454545454546, "step": 479 }, { "epoch": 0.329105245114844, "grad_norm": 0.27125465435708385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73335792.24615385, "logits/rejected": 126228577.52380952, "logps/chosen": -205.04615384615386, "logps/rejected": -395.6825396825397, "loss": 0.1924, "rewards/chosen": 1.1009615384615385, "rewards/margins": 6.648580586080586, "rewards/rejected": -5.5476190476190474, "step": 480 }, { "epoch": 0.3297908810421666, "grad_norm": 0.21067983393118037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108527616.0, "logits/rejected": 148834241.93939394, "logps/chosen": -248.0, "logps/rejected": -379.1515151515151, "loss": 0.1689, "rewards/chosen": 1.314516129032258, "rewards/margins": 8.15542521994135, "rewards/rejected": -6.840909090909091, "step": 481 }, { "epoch": 0.3304765169694892, "grad_norm": 0.17236638986452346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 180355072.0, "logits/rejected": 56832819.2, "logps/chosen": -322.7586206896552, "logps/rejected": -310.85714285714283, "loss": 0.1823, "rewards/chosen": 0.7640086206896551, "rewards/margins": 6.506865763546798, "rewards/rejected": -5.742857142857143, "step": 482 }, { "epoch": 0.3311621528968118, "grad_norm": 0.3490859562992475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115360272.51612903, "logits/rejected": 116455486.06060606, "logps/chosen": -197.29032258064515, "logps/rejected": -328.4848484848485, "loss": 0.1994, "rewards/chosen": 0.9440524193548387, "rewards/margins": 7.141022116324536, "rewards/rejected": -6.196969696969697, "step": 483 }, { "epoch": 0.3318477888241344, "grad_norm": 0.3114936287454587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130809856.0, "logits/rejected": 120717312.0, "logps/chosen": -204.5, "logps/rejected": -338.0, "loss": 0.1653, "rewards/chosen": 1.4921875, "rewards/margins": 5.8515625, "rewards/rejected": -4.359375, "step": 484 }, { "epoch": 0.332533424751457, "grad_norm": 0.29184205078621395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102633347.87878788, "logits/rejected": 124104043.35483871, "logps/chosen": -199.5151515151515, "logps/rejected": -325.93548387096774, "loss": 0.2054, "rewards/chosen": 1.277462121212121, "rewards/margins": 6.051655669599218, "rewards/rejected": -4.774193548387097, "step": 485 }, { "epoch": 0.33321906067877954, "grad_norm": 0.16731955746581265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106892150.4477612, "logits/rejected": 97294100.98360656, "logps/chosen": -325.25373134328356, "logps/rejected": -318.1639344262295, "loss": 0.1807, "rewards/chosen": 2.0578358208955225, "rewards/margins": 7.758655493026669, "rewards/rejected": -5.700819672131147, "step": 486 }, { "epoch": 0.33390469660610217, "grad_norm": 0.32440878994368055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171622668.59016395, "logits/rejected": 62061613.85074627, "logps/chosen": -243.9344262295082, "logps/rejected": -284.8955223880597, "loss": 0.1883, "rewards/chosen": 1.0655737704918034, "rewards/margins": 7.177514068999266, "rewards/rejected": -6.111940298507463, "step": 487 }, { "epoch": 0.33459033253342474, "grad_norm": 0.18620647187971123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126379192.6557377, "logits/rejected": 69714653.6119403, "logps/chosen": -263.8688524590164, "logps/rejected": -295.8805970149254, "loss": 0.2147, "rewards/chosen": 0.6280737704918032, "rewards/margins": 5.97882003914852, "rewards/rejected": -5.350746268656716, "step": 488 }, { "epoch": 0.33527596846074736, "grad_norm": 0.17514337422231557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101741831.31428571, "logits/rejected": 96685938.7586207, "logps/chosen": -210.5142857142857, "logps/rejected": -296.0, "loss": 0.1989, "rewards/chosen": 1.2803571428571427, "rewards/margins": 6.625184729064039, "rewards/rejected": -5.344827586206897, "step": 489 }, { "epoch": 0.33596160438806993, "grad_norm": 0.1986319678536007, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161727427.7647059, "logits/rejected": 58650350.93333333, "logps/chosen": -234.58823529411765, "logps/rejected": -334.93333333333334, "loss": 0.215, "rewards/chosen": 0.6415441176470589, "rewards/margins": 6.441544117647059, "rewards/rejected": -5.8, "step": 490 }, { "epoch": 0.3366472403153925, "grad_norm": 0.1847161034447233, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153223168.0, "logits/rejected": 111673344.0, "logps/chosen": -303.75, "logps/rejected": -391.0, "loss": 0.1717, "rewards/chosen": 1.298828125, "rewards/margins": 7.728515625, "rewards/rejected": -6.4296875, "step": 491 }, { "epoch": 0.33733287624271513, "grad_norm": 0.18021990201525728, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143931171.44615385, "logits/rejected": 147266673.7777778, "logps/chosen": -220.55384615384617, "logps/rejected": -481.015873015873, "loss": 0.1956, "rewards/chosen": 0.8288461538461539, "rewards/margins": 7.543131868131868, "rewards/rejected": -6.714285714285714, "step": 492 }, { "epoch": 0.3380185121700377, "grad_norm": 0.2713716617614279, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142827088.84210527, "logits/rejected": 93101733.85915492, "logps/chosen": -219.50877192982455, "logps/rejected": -344.7887323943662, "loss": 0.1898, "rewards/chosen": 0.6271929824561403, "rewards/margins": 4.7328267852730415, "rewards/rejected": -4.105633802816901, "step": 493 }, { "epoch": 0.3387041480973603, "grad_norm": 0.21764582550931724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 194710225.12676057, "logits/rejected": 22479997.754385963, "logps/chosen": -256.67605633802816, "logps/rejected": -344.70175438596493, "loss": 0.2244, "rewards/chosen": 0.971830985915493, "rewards/margins": 4.002532740301458, "rewards/rejected": -3.030701754385965, "step": 494 }, { "epoch": 0.3393897840246829, "grad_norm": 0.18895775973185844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149391239.52941176, "logits/rejected": 81753975.46666667, "logps/chosen": -192.23529411764707, "logps/rejected": -377.06666666666666, "loss": 0.196, "rewards/chosen": 1.052389705882353, "rewards/margins": 5234403.185723039, "rewards/rejected": -5234402.133333334, "step": 495 }, { "epoch": 0.34007541995200546, "grad_norm": 0.18088607432549078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123768760.14035088, "logits/rejected": 132091038.64788732, "logps/chosen": -270.5964912280702, "logps/rejected": -410.59154929577466, "loss": 0.1841, "rewards/chosen": 1.1091008771929824, "rewards/margins": 7.735861440573265, "rewards/rejected": -6.626760563380282, "step": 496 }, { "epoch": 0.3407610558793281, "grad_norm": 0.22318240420329732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150994944.0, "logits/rejected": 138949763.2820513, "logps/chosen": -272.32, "logps/rejected": -405.3333333333333, "loss": 0.1387, "rewards/chosen": 1.47375, "rewards/margins": 8.595544871794871, "rewards/rejected": -7.121794871794871, "step": 497 }, { "epoch": 0.34144669180665066, "grad_norm": 0.18072143905253985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111149056.0, "logits/rejected": 93125706.20289855, "logps/chosen": -234.03389830508473, "logps/rejected": -383.30434782608694, "loss": 0.1919, "rewards/chosen": 0.9782838983050848, "rewards/margins": 4825099.181182449, "rewards/rejected": -4825098.202898551, "step": 498 }, { "epoch": 0.3421323277339733, "grad_norm": 0.24625139198320012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129734161.65517241, "logits/rejected": 134277646.62857142, "logps/chosen": -235.0344827586207, "logps/rejected": -364.8, "loss": 0.1869, "rewards/chosen": 0.9089776400862069, "rewards/margins": 7.287549068657636, "rewards/rejected": -6.378571428571429, "step": 499 }, { "epoch": 0.34281796366129585, "grad_norm": 0.21472258432812702, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117511601.89830509, "logits/rejected": 76348490.20289855, "logps/chosen": -203.79661016949152, "logps/rejected": -347.82608695652175, "loss": 0.1796, "rewards/chosen": 1.0434322033898304, "rewards/margins": 7.833287275853599, "rewards/rejected": -6.7898550724637685, "step": 500 }, { "epoch": 0.3435035995886184, "grad_norm": 0.19119531465668085, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120324096.0, "logits/rejected": 111706112.0, "logps/chosen": -191.75, "logps/rejected": -371.5, "loss": 0.1733, "rewards/chosen": 1.1220703125, "rewards/margins": 8.5986328125, "rewards/rejected": -7.4765625, "step": 501 }, { "epoch": 0.34418923551594105, "grad_norm": 0.16374606006472595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81633583.4074074, "logits/rejected": 103270566.05405405, "logps/chosen": -228.44444444444446, "logps/rejected": -343.7837837837838, "loss": 0.1792, "rewards/chosen": 0.859375, "rewards/margins": 7.393158783783784, "rewards/rejected": -6.533783783783784, "step": 502 }, { "epoch": 0.3448748714432636, "grad_norm": 0.19090495003316943, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183617308.44444445, "logits/rejected": 82111566.76923077, "logps/chosen": -319.4920634920635, "logps/rejected": -331.32307692307694, "loss": 0.1628, "rewards/chosen": 1.8115079365079365, "rewards/margins": 7.926892551892552, "rewards/rejected": -6.115384615384615, "step": 503 }, { "epoch": 0.34556050737058625, "grad_norm": 0.29964201996095613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117378831.05882353, "logits/rejected": 103040068.26666667, "logps/chosen": -230.11764705882354, "logps/rejected": -369.3333333333333, "loss": 0.2215, "rewards/chosen": 0.9292279411764706, "rewards/margins": 7.37922794117647, "rewards/rejected": -6.45, "step": 504 }, { "epoch": 0.3462461432979088, "grad_norm": 0.17939255713517854, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129878792.8275862, "logits/rejected": 92035013.48571429, "logps/chosen": -218.48275862068965, "logps/rejected": -373.0285714285714, "loss": 0.1691, "rewards/chosen": 1.0484913793103448, "rewards/margins": 8.269919950738917, "rewards/rejected": -7.2214285714285715, "step": 505 }, { "epoch": 0.3469317792252314, "grad_norm": 0.19970813040734706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130023424.0, "logits/rejected": 111057305.6, "logps/chosen": -213.88235294117646, "logps/rejected": -361.3333333333333, "loss": 0.1863, "rewards/chosen": 1.4797794117647058, "rewards/margins": 6.704779411764705, "rewards/rejected": -5.225, "step": 506 }, { "epoch": 0.347617415152554, "grad_norm": 0.21781559692359947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110932109.2413793, "logits/rejected": 95480334.62857144, "logps/chosen": -290.2068965517241, "logps/rejected": -293.9428571428571, "loss": 0.1467, "rewards/chosen": 1.3038793103448276, "rewards/margins": 8.253879310344828, "rewards/rejected": -6.95, "step": 507 }, { "epoch": 0.3483030510798766, "grad_norm": 0.158174162450245, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140853492.53731343, "logits/rejected": 81857687.08196722, "logps/chosen": -279.1641791044776, "logps/rejected": -316.327868852459, "loss": 0.1847, "rewards/chosen": 1.330223880597015, "rewards/margins": 7.715469782236359, "rewards/rejected": -6.385245901639344, "step": 508 }, { "epoch": 0.34898868700719915, "grad_norm": 0.19514288506036018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141779290.14084506, "logits/rejected": 103238745.8245614, "logps/chosen": -263.88732394366195, "logps/rejected": -395.50877192982455, "loss": 0.2317, "rewards/chosen": 0.6144366197183099, "rewards/margins": 5.395138374104275, "rewards/rejected": -4.780701754385965, "step": 509 }, { "epoch": 0.3496743229345218, "grad_norm": 0.29525030392401413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123667440.24615385, "logits/rejected": 112247564.1904762, "logps/chosen": -231.13846153846154, "logps/rejected": -324.57142857142856, "loss": 0.2025, "rewards/chosen": 0.90625, "rewards/margins": 7.509424603174603, "rewards/rejected": -6.603174603174603, "step": 510 }, { "epoch": 0.35035995886184435, "grad_norm": 0.16151092541290843, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102534600.86153845, "logits/rejected": 181620020.82539684, "logps/chosen": -218.09230769230768, "logps/rejected": -361.6507936507937, "loss": 0.1947, "rewards/chosen": 0.8951923076923077, "rewards/margins": 6.712652625152625, "rewards/rejected": -5.817460317460317, "step": 511 }, { "epoch": 0.35104559478916697, "grad_norm": 0.18216645848817853, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129157932.6984127, "logits/rejected": 67237919.5076923, "logps/chosen": -182.85714285714286, "logps/rejected": -281.6, "loss": 0.1702, "rewards/chosen": 1.1587301587301588, "rewards/margins": 7.766422466422466, "rewards/rejected": -6.607692307692307, "step": 512 }, { "epoch": 0.35173123071648954, "grad_norm": 0.21365461675685643, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131538033.77777778, "logits/rejected": 107628836.57142857, "logps/chosen": -212.88888888888889, "logps/rejected": -354.57142857142856, "loss": 0.1953, "rewards/chosen": 1.2717013888888888, "rewards/margins": 6.575272817460318, "rewards/rejected": -5.303571428571429, "step": 513 }, { "epoch": 0.3524168666438121, "grad_norm": 0.1782147529062245, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118435770.57627119, "logits/rejected": 104918387.01449275, "logps/chosen": -227.9322033898305, "logps/rejected": -347.3623188405797, "loss": 0.1558, "rewards/chosen": 0.6779661016949152, "rewards/margins": 7.257676246622451, "rewards/rejected": -6.579710144927536, "step": 514 }, { "epoch": 0.35310250257113474, "grad_norm": 0.18763133561946885, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119804574.25454545, "logits/rejected": 96066798.46575342, "logps/chosen": -164.5090909090909, "logps/rejected": -376.54794520547944, "loss": 0.1649, "rewards/chosen": 1.1318181818181818, "rewards/margins": 7.350996264009963, "rewards/rejected": -6.219178082191781, "step": 515 }, { "epoch": 0.3537881384984573, "grad_norm": 0.23070594476573497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97272900.26666667, "logits/rejected": 88943917.1764706, "logps/chosen": -213.2, "logps/rejected": -272.0, "loss": 0.1771, "rewards/chosen": 1.35625, "rewards/margins": 6.988602941176471, "rewards/rejected": -5.632352941176471, "step": 516 }, { "epoch": 0.35447377442577993, "grad_norm": 0.20261612875377655, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 194565624.35820895, "logits/rejected": 76013165.1147541, "logps/chosen": -256.95522388059703, "logps/rejected": -282.4918032786885, "loss": 0.2123, "rewards/chosen": 0.871268656716418, "rewards/margins": 7.289301443601664, "rewards/rejected": -6.418032786885246, "step": 517 }, { "epoch": 0.3551594103531025, "grad_norm": 0.26696646365970006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135591724.13793105, "logits/rejected": 120196768.91428572, "logps/chosen": -283.58620689655174, "logps/rejected": -368.45714285714286, "loss": 0.1677, "rewards/chosen": 1.150323275862069, "rewards/margins": 7.914608990147784, "rewards/rejected": -6.764285714285714, "step": 518 }, { "epoch": 0.35584504628042507, "grad_norm": 0.2187812398029604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160540601.37931034, "logits/rejected": 88080384.0, "logps/chosen": -273.37931034482756, "logps/rejected": -359.3142857142857, "loss": 0.187, "rewards/chosen": 1.1260775862068966, "rewards/margins": 7.054649014778326, "rewards/rejected": -5.928571428571429, "step": 519 }, { "epoch": 0.3565306822077477, "grad_norm": 0.21502760176329425, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131879919.21311475, "logits/rejected": 119944574.08955224, "logps/chosen": -257.8360655737705, "logps/rejected": -374.92537313432837, "loss": 0.1948, "rewards/chosen": 0.9195696721311475, "rewards/margins": 7.330017433325177, "rewards/rejected": -6.41044776119403, "step": 520 }, { "epoch": 0.35721631813507027, "grad_norm": 0.207367245748051, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137930718.4262295, "logits/rejected": 90271438.3283582, "logps/chosen": -210.88524590163934, "logps/rejected": -361.55223880597015, "loss": 0.1988, "rewards/chosen": 0.9738729508196722, "rewards/margins": 7.839544592610717, "rewards/rejected": -6.865671641791045, "step": 521 }, { "epoch": 0.3579019540623929, "grad_norm": 0.2950341947204693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129515023.51515152, "logits/rejected": 74398158.4516129, "logps/chosen": -236.36363636363637, "logps/rejected": -325.93548387096774, "loss": 0.1915, "rewards/chosen": 1.4640151515151516, "rewards/margins": 7.165628054740957, "rewards/rejected": -5.701612903225806, "step": 522 }, { "epoch": 0.35858758998971546, "grad_norm": 0.19722291912677187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 48103424.0, "logits/rejected": 132644864.0, "logps/chosen": -225.75, "logps/rejected": -334.75, "loss": 0.2074, "rewards/chosen": 1.0546875, "rewards/margins": 8.3515625, "rewards/rejected": -7.296875, "step": 523 }, { "epoch": 0.35927322591703803, "grad_norm": 0.14796842166380264, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124151398.4, "logits/rejected": 126561458.79365079, "logps/chosen": -285.9076923076923, "logps/rejected": -392.12698412698415, "loss": 0.1737, "rewards/chosen": 1.728846153846154, "rewards/margins": 6.7209096459096465, "rewards/rejected": -4.992063492063492, "step": 524 }, { "epoch": 0.35995886184436066, "grad_norm": 0.19236169373813966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122683392.0, "logits/rejected": 88145920.0, "logps/chosen": -254.5, "logps/rejected": -362.5, "loss": 0.1774, "rewards/chosen": 1.1953125, "rewards/margins": 7.9375, "rewards/rejected": -6.7421875, "step": 525 }, { "epoch": 0.3606444977716832, "grad_norm": 0.25577733397943286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143305386.66666666, "logits/rejected": 53107290.35294118, "logps/chosen": -204.8, "logps/rejected": -332.2352941176471, "loss": 0.1774, "rewards/chosen": 1.4197916666666666, "rewards/margins": 7.449203431372549, "rewards/rejected": -6.029411764705882, "step": 526 }, { "epoch": 0.36133013369900585, "grad_norm": 0.19078604567355084, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 192804831.4920635, "logits/rejected": 106148155.07692307, "logps/chosen": -280.8888888888889, "logps/rejected": -347.0769230769231, "loss": 0.1737, "rewards/chosen": 1.0406746031746033, "rewards/margins": 6.356059218559219, "rewards/rejected": -5.315384615384615, "step": 527 }, { "epoch": 0.3620157696263284, "grad_norm": 0.26536835934708114, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110782886.6031746, "logits/rejected": 120279733.16923077, "logps/chosen": -233.3968253968254, "logps/rejected": -336.73846153846154, "loss": 0.1822, "rewards/chosen": 1.0228174603174602, "rewards/margins": 7.49204822954823, "rewards/rejected": -6.469230769230769, "step": 528 }, { "epoch": 0.362701405553651, "grad_norm": 0.1952783940109573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117956734.03076923, "logits/rejected": 76288065.01587301, "logps/chosen": -254.27692307692308, "logps/rejected": -339.04761904761904, "loss": 0.1745, "rewards/chosen": 1.85, "rewards/margins": 7.643650793650794, "rewards/rejected": -5.7936507936507935, "step": 529 }, { "epoch": 0.3633870414809736, "grad_norm": 0.23757148646967732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131932771.34328358, "logits/rejected": 123740562.8852459, "logps/chosen": -230.6865671641791, "logps/rejected": -370.0983606557377, "loss": 0.1979, "rewards/chosen": 1.1119402985074627, "rewards/margins": 7.7103009542451675, "rewards/rejected": -6.598360655737705, "step": 530 }, { "epoch": 0.3640726774082962, "grad_norm": 0.20755382446793866, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105499400.8275862, "logits/rejected": 121822061.71428572, "logps/chosen": -237.51724137931035, "logps/rejected": -360.22857142857146, "loss": 0.2019, "rewards/chosen": 0.9391163793103449, "rewards/margins": 6.789116379310345, "rewards/rejected": -5.85, "step": 531 }, { "epoch": 0.3647583133356188, "grad_norm": 0.24719800889961918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134460876.057971, "logits/rejected": 103293622.23728813, "logps/chosen": -259.71014492753625, "logps/rejected": -333.2881355932203, "loss": 0.1972, "rewards/chosen": 0.6829710144927537, "rewards/margins": 6.860937116187669, "rewards/rejected": -6.177966101694915, "step": 532 }, { "epoch": 0.3654439492629414, "grad_norm": 0.1853256932807826, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154421276.84507042, "logits/rejected": 59124969.543859646, "logps/chosen": -285.7464788732394, "logps/rejected": -279.0175438596491, "loss": 0.183, "rewards/chosen": 1.1518485915492958, "rewards/margins": 7.686936310847542, "rewards/rejected": -6.535087719298246, "step": 533 }, { "epoch": 0.36612958519026395, "grad_norm": 0.20015676420750736, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172953359.05882353, "logits/rejected": 101082726.4, "logps/chosen": -252.0, "logps/rejected": -385.3333333333333, "loss": 0.1796, "rewards/chosen": 1.411764705882353, "rewards/margins": 5.311764705882353, "rewards/rejected": -3.9, "step": 534 }, { "epoch": 0.3668152211175866, "grad_norm": 0.1945999447016944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151370553.31343284, "logits/rejected": 117853066.49180327, "logps/chosen": -287.5223880597015, "logps/rejected": -375.08196721311475, "loss": 0.2069, "rewards/chosen": 1.4440298507462686, "rewards/margins": 8.058783949106925, "rewards/rejected": -6.614754098360656, "step": 535 }, { "epoch": 0.36750085704490915, "grad_norm": 0.27876584165083584, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77839855.48387097, "logits/rejected": 128498222.54545455, "logps/chosen": -212.1290322580645, "logps/rejected": -368.4848484848485, "loss": 0.1893, "rewards/chosen": 0.8644153225806451, "rewards/margins": 7.311385019550342, "rewards/rejected": -6.446969696969697, "step": 536 }, { "epoch": 0.36818649297223177, "grad_norm": 0.17983151955381185, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119021970.8852459, "logits/rejected": 100099882.02985075, "logps/chosen": -256.0, "logps/rejected": -341.4925373134328, "loss": 0.1711, "rewards/chosen": 1.1905737704918034, "rewards/margins": 8.138334964521654, "rewards/rejected": -6.947761194029851, "step": 537 }, { "epoch": 0.36887212889955434, "grad_norm": 0.20829575726946062, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108932066.74285714, "logits/rejected": 111655265.10344827, "logps/chosen": -270.62857142857143, "logps/rejected": -382.62068965517244, "loss": 0.2002, "rewards/chosen": 1.1285714285714286, "rewards/margins": 7.99064039408867, "rewards/rejected": -6.862068965517241, "step": 538 }, { "epoch": 0.3695577648268769, "grad_norm": 0.27776663676776275, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102147434.33846153, "logits/rejected": 150196028.95238096, "logps/chosen": -240.4923076923077, "logps/rejected": -399.74603174603175, "loss": 0.1867, "rewards/chosen": 1.1375, "rewards/margins": 7.804166666666667, "rewards/rejected": -6.666666666666667, "step": 539 }, { "epoch": 0.37024340075419954, "grad_norm": 0.18237663301951357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 58645357.71428572, "logits/rejected": 154140672.0, "logps/chosen": -256.2857142857143, "logps/rejected": -404.0, "loss": 0.1801, "rewards/chosen": 1.0915178571428572, "rewards/margins": 7.36235119047619, "rewards/rejected": -6.270833333333333, "step": 540 }, { "epoch": 0.3709290366815221, "grad_norm": 0.159260422240138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97217974.85714285, "logits/rejected": 94954382.22222222, "logps/chosen": -194.28571428571428, "logps/rejected": -376.0, "loss": 0.1652, "rewards/chosen": 1.1350446428571428, "rewards/margins": 8.357266865079366, "rewards/rejected": -7.222222222222222, "step": 541 }, { "epoch": 0.3716146726088447, "grad_norm": 0.19337307529717745, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157163038.11764705, "logits/rejected": 113525828.26666667, "logps/chosen": -239.05882352941177, "logps/rejected": -383.46666666666664, "loss": 0.2156, "rewards/chosen": 1.228860294117647, "rewards/margins": 7.27469362745098, "rewards/rejected": -6.045833333333333, "step": 542 }, { "epoch": 0.3723003085361673, "grad_norm": 0.1686882823363199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133618541.71428572, "logits/rejected": 78751673.37931034, "logps/chosen": -257.37142857142857, "logps/rejected": -344.0, "loss": 0.1763, "rewards/chosen": 1.542857142857143, "rewards/margins": 7.948029556650246, "rewards/rejected": -6.405172413793103, "step": 543 }, { "epoch": 0.37298594446348987, "grad_norm": 0.24257844924167196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121896960.0, "logits/rejected": 69562368.0, "logps/chosen": -206.25, "logps/rejected": -346.5, "loss": 0.1727, "rewards/chosen": 1.6103515625, "rewards/margins": 7.7900390625, "rewards/rejected": -6.1796875, "step": 544 }, { "epoch": 0.3736715803908125, "grad_norm": 0.18915581091878161, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143568963.147541, "logits/rejected": 85075509.49253732, "logps/chosen": -299.27868852459017, "logps/rejected": -359.1641791044776, "loss": 0.1726, "rewards/chosen": 1.447233606557377, "rewards/margins": 8.312905248348422, "rewards/rejected": -6.865671641791045, "step": 545 }, { "epoch": 0.37435721631813507, "grad_norm": 0.18532964390667658, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102191728.81355932, "logits/rejected": 66284440.11594203, "logps/chosen": -157.42372881355934, "logps/rejected": -312.57971014492756, "loss": 0.1962, "rewards/chosen": 0.7748940677966102, "rewards/margins": 6.9125752272169, "rewards/rejected": -6.13768115942029, "step": 546 }, { "epoch": 0.37504285224545764, "grad_norm": 0.22162793346309984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96727103.01538461, "logits/rejected": 110583157.84126984, "logps/chosen": -219.07692307692307, "logps/rejected": -337.26984126984127, "loss": 0.1914, "rewards/chosen": 1.2807692307692307, "rewards/margins": 7.915689865689866, "rewards/rejected": -6.634920634920635, "step": 547 }, { "epoch": 0.37572848817278026, "grad_norm": 0.16968516120123406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106255701.33333333, "logits/rejected": 119907749.64705883, "logps/chosen": -259.2, "logps/rejected": -360.2352941176471, "loss": 0.1568, "rewards/chosen": 1.4145833333333333, "rewards/margins": 8.098406862745097, "rewards/rejected": -6.6838235294117645, "step": 548 }, { "epoch": 0.37641412410010283, "grad_norm": 0.2038700905943399, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147639500.8, "logits/rejected": 62548390.603174604, "logps/chosen": -281.6, "logps/rejected": -313.3968253968254, "loss": 0.1946, "rewards/chosen": 1.4846153846153847, "rewards/margins": 6.2465201465201465, "rewards/rejected": -4.761904761904762, "step": 549 }, { "epoch": 0.37709976002742546, "grad_norm": 0.20569597350439872, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123183482.09230769, "logits/rejected": 156587349.33333334, "logps/chosen": -293.4153846153846, "logps/rejected": -421.58730158730157, "loss": 0.1894, "rewards/chosen": 1.3509615384615385, "rewards/margins": 5.890644078144078, "rewards/rejected": -4.5396825396825395, "step": 550 }, { "epoch": 0.377785395954748, "grad_norm": 0.20392587783149752, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173454765.41935483, "logits/rejected": 37113235.39393939, "logps/chosen": -340.38709677419354, "logps/rejected": -341.8181818181818, "loss": 0.1845, "rewards/chosen": 1.5, "rewards/margins": 8.098484848484848, "rewards/rejected": -6.598484848484849, "step": 551 }, { "epoch": 0.3784710318820706, "grad_norm": 0.20345707336564944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150883095.89333335, "logits/rejected": 86161292.0754717, "logps/chosen": -271.36, "logps/rejected": -353.50943396226415, "loss": 0.2149, "rewards/chosen": 1.2225, "rewards/margins": 6.637594339622642, "rewards/rejected": -5.415094339622642, "step": 552 }, { "epoch": 0.3791566678093932, "grad_norm": 0.20837831962494152, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150589043.61290324, "logits/rejected": 71430268.12121212, "logps/chosen": -256.258064516129, "logps/rejected": -345.6969696969697, "loss": 0.1828, "rewards/chosen": 1.2711693548387097, "rewards/margins": 8.30147238514174, "rewards/rejected": -7.03030303030303, "step": 553 }, { "epoch": 0.3798423037367158, "grad_norm": 0.15596413763190797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118678175.47540984, "logits/rejected": 123450261.01492538, "logps/chosen": -280.1311475409836, "logps/rejected": -391.4029850746269, "loss": 0.1664, "rewards/chosen": 1.4651639344262295, "rewards/margins": 7.838298262784439, "rewards/rejected": -6.373134328358209, "step": 554 }, { "epoch": 0.3805279396640384, "grad_norm": 0.20682233176354003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113521244.32786885, "logits/rejected": 71991785.07462686, "logps/chosen": -265.44262295081967, "logps/rejected": -419.34328358208955, "loss": 0.147, "rewards/chosen": 1.8442622950819672, "rewards/margins": 8.702471250305848, "rewards/rejected": -6.858208955223881, "step": 555 }, { "epoch": 0.381213575591361, "grad_norm": 0.18526042012242183, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121441232.73846154, "logits/rejected": 144470471.1111111, "logps/chosen": -233.84615384615384, "logps/rejected": -422.0952380952381, "loss": 0.1558, "rewards/chosen": 1.7615384615384615, "rewards/margins": 8.245665445665445, "rewards/rejected": -6.484126984126984, "step": 556 }, { "epoch": 0.38189921151868356, "grad_norm": 0.16102883212281366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162546469.7704918, "logits/rejected": 68869043.58208956, "logps/chosen": -254.68852459016392, "logps/rejected": -339.1044776119403, "loss": 0.1773, "rewards/chosen": 1.271516393442623, "rewards/margins": 8.293904453144116, "rewards/rejected": -7.022388059701493, "step": 557 }, { "epoch": 0.3825848474460062, "grad_norm": 0.31141411938678, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 192139068.95238096, "logits/rejected": 70964397.29230769, "logps/chosen": -299.42857142857144, "logps/rejected": -384.9846153846154, "loss": 0.1796, "rewards/chosen": 1.4623015873015872, "rewards/margins": 5.693070818070818, "rewards/rejected": -4.230769230769231, "step": 558 }, { "epoch": 0.38327048337332875, "grad_norm": 0.16495141110253572, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93847552.0, "logits/rejected": 116421063.1111111, "logps/chosen": -266.57142857142856, "logps/rejected": -384.44444444444446, "loss": 0.18, "rewards/chosen": 1.3900669642857142, "rewards/margins": 7.542844742063492, "rewards/rejected": -6.152777777777778, "step": 559 }, { "epoch": 0.3839561193006514, "grad_norm": 0.17062725910537357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118073614.49056605, "logits/rejected": 84669016.74666667, "logps/chosen": -185.96226415094338, "logps/rejected": -358.82666666666665, "loss": 0.1259, "rewards/chosen": 1.4622641509433962, "rewards/margins": 8.122264150943396, "rewards/rejected": -6.66, "step": 560 }, { "epoch": 0.38464175522797395, "grad_norm": 0.1810760721805791, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132311226.18181819, "logits/rejected": 77865224.25806452, "logps/chosen": -261.57575757575756, "logps/rejected": -350.4516129032258, "loss": 0.1712, "rewards/chosen": 1.7973484848484849, "rewards/margins": 8.047348484848484, "rewards/rejected": -6.25, "step": 561 }, { "epoch": 0.3853273911552965, "grad_norm": 0.18693169417272837, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 192238933.33333334, "logits/rejected": 101280105.41176471, "logps/chosen": -248.0, "logps/rejected": -426.3529411764706, "loss": 0.1871, "rewards/chosen": 1.2447916666666667, "rewards/margins": 4.8036151960784315, "rewards/rejected": -3.5588235294117645, "step": 562 }, { "epoch": 0.38601302708261914, "grad_norm": 0.2058273334953361, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105906176.0, "logits/rejected": 96239616.0, "logps/chosen": -245.0, "logps/rejected": -321.25, "loss": 0.1715, "rewards/chosen": 1.740234375, "rewards/margins": 6.708984375, "rewards/rejected": -4.96875, "step": 563 }, { "epoch": 0.3866986630099417, "grad_norm": 0.15704171513613546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142479235.87878788, "logits/rejected": 110810805.67741935, "logps/chosen": -215.27272727272728, "logps/rejected": -367.48387096774195, "loss": 0.1776, "rewards/chosen": 1.1799242424242424, "rewards/margins": 6.034762952101661, "rewards/rejected": -4.854838709677419, "step": 564 }, { "epoch": 0.38738429893726434, "grad_norm": 0.17300151298067928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69611916.38709678, "logits/rejected": 97231592.72727273, "logps/chosen": -196.6451612903226, "logps/rejected": -309.8181818181818, "loss": 0.1733, "rewards/chosen": 1.3125, "rewards/margins": 8.214015151515152, "rewards/rejected": -6.901515151515151, "step": 565 }, { "epoch": 0.3880699348645869, "grad_norm": 0.1872145937777069, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 174425193.93103448, "logits/rejected": 85324127.08571428, "logps/chosen": -309.2413793103448, "logps/rejected": -373.48571428571427, "loss": 0.1968, "rewards/chosen": 0.6109913793103449, "rewards/margins": 7.310991379310345, "rewards/rejected": -6.7, "step": 566 }, { "epoch": 0.3887555707919095, "grad_norm": 0.27137969322554234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108488490.02985075, "logits/rejected": 117509271.08196722, "logps/chosen": -241.91044776119404, "logps/rejected": -322.88524590163934, "loss": 0.2079, "rewards/chosen": 1.0513059701492538, "rewards/margins": 5.948846953755811, "rewards/rejected": -4.897540983606557, "step": 567 }, { "epoch": 0.3894412067192321, "grad_norm": 0.2688639125871067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147113647.76119402, "logits/rejected": 57173176.655737706, "logps/chosen": -246.92537313432837, "logps/rejected": -340.1967213114754, "loss": 0.1868, "rewards/chosen": 1.5335820895522387, "rewards/margins": 6.853254220699779, "rewards/rejected": -5.319672131147541, "step": 568 }, { "epoch": 0.3901268426465547, "grad_norm": 0.2584300921148379, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131820982.85714285, "logits/rejected": 89306411.32307692, "logps/chosen": -275.42857142857144, "logps/rejected": -326.89230769230767, "loss": 0.1909, "rewards/chosen": 1.6636904761904763, "rewards/margins": 8.071382783882784, "rewards/rejected": -6.407692307692308, "step": 569 }, { "epoch": 0.3908124785738773, "grad_norm": 0.18185935004524642, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145678220.6197183, "logits/rejected": 143930853.0526316, "logps/chosen": -277.1830985915493, "logps/rejected": -377.2631578947368, "loss": 0.1866, "rewards/chosen": 1.2130281690140845, "rewards/margins": 9.476186063750927, "rewards/rejected": -8.263157894736842, "step": 570 }, { "epoch": 0.39149811450119987, "grad_norm": 0.2041751877740252, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146858096.21917808, "logits/rejected": 72599589.23636363, "logps/chosen": -225.53424657534248, "logps/rejected": -352.0, "loss": 0.1898, "rewards/chosen": 1.3664383561643836, "rewards/margins": 8.09371108343711, "rewards/rejected": -6.7272727272727275, "step": 571 }, { "epoch": 0.39218375042852244, "grad_norm": 0.18321404199054947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107293002.32258065, "logits/rejected": 96532542.06060606, "logps/chosen": -243.3548387096774, "logps/rejected": -366.54545454545456, "loss": 0.1666, "rewards/chosen": 1.502016129032258, "rewards/margins": 6.767167644183774, "rewards/rejected": -5.265151515151516, "step": 572 }, { "epoch": 0.39286938635584506, "grad_norm": 0.18247722226797347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138531869.25714287, "logits/rejected": 127420062.89655173, "logps/chosen": -250.05714285714285, "logps/rejected": -371.86206896551727, "loss": 0.1965, "rewards/chosen": 0.8785714285714286, "rewards/margins": 8.680295566502464, "rewards/rejected": -7.801724137931035, "step": 573 }, { "epoch": 0.39355502228316763, "grad_norm": 0.1854248169788811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113652108.38709678, "logits/rejected": 99519394.9090909, "logps/chosen": -193.5483870967742, "logps/rejected": -337.93939393939394, "loss": 0.1696, "rewards/chosen": 1.3659274193548387, "rewards/margins": 7.858351661779081, "rewards/rejected": -6.492424242424242, "step": 574 }, { "epoch": 0.3942406582104902, "grad_norm": 0.1829772052507558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151265544.2580645, "logits/rejected": 114517209.21212122, "logps/chosen": -264.9032258064516, "logps/rejected": -381.57575757575756, "loss": 0.1629, "rewards/chosen": 1.6794354838709677, "rewards/margins": 9.194586999022484, "rewards/rejected": -7.515151515151516, "step": 575 }, { "epoch": 0.39492629413781283, "grad_norm": 0.19653594049978684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148688076.8, "logits/rejected": 78643200.0, "logps/chosen": -210.13333333333333, "logps/rejected": -357.6470588235294, "loss": 0.1678, "rewards/chosen": 1.3916666666666666, "rewards/margins": 8.457843137254901, "rewards/rejected": -7.0661764705882355, "step": 576 }, { "epoch": 0.3956119300651354, "grad_norm": 0.17603657789148044, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124897052.44444445, "logits/rejected": 105197678.7027027, "logps/chosen": -224.88888888888889, "logps/rejected": -353.72972972972974, "loss": 0.1731, "rewards/chosen": 1.2054398148148149, "rewards/margins": 8.205439814814815, "rewards/rejected": -7.0, "step": 577 }, { "epoch": 0.396297565992458, "grad_norm": 0.19557387823616373, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133758078.24657534, "logits/rejected": 106154021.23636363, "logps/chosen": -220.05479452054794, "logps/rejected": -337.1636363636364, "loss": 0.2073, "rewards/chosen": 1.1147260273972603, "rewards/margins": 7.191998754669988, "rewards/rejected": -6.077272727272727, "step": 578 }, { "epoch": 0.3969832019197806, "grad_norm": 0.18981208358326557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114002557.90163934, "logits/rejected": 93276312.8358209, "logps/chosen": -186.75409836065575, "logps/rejected": -310.92537313432837, "loss": 0.1997, "rewards/chosen": 1.1557377049180328, "rewards/margins": 6.058722779544898, "rewards/rejected": -4.902985074626866, "step": 579 }, { "epoch": 0.39766883784710316, "grad_norm": 0.2428103311186941, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121667584.0, "logits/rejected": 134742016.0, "logps/chosen": -246.75, "logps/rejected": -437.0, "loss": 0.1913, "rewards/chosen": 0.7958984375, "rewards/margins": 7.5146484375, "rewards/rejected": -6.71875, "step": 580 }, { "epoch": 0.3983544737744258, "grad_norm": 0.1746826423841732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112306105.37931034, "logits/rejected": 94731351.77142857, "logps/chosen": -212.27586206896552, "logps/rejected": -340.1142857142857, "loss": 0.1553, "rewards/chosen": 1.5129310344827587, "rewards/margins": 7.020073891625616, "rewards/rejected": -5.507142857142857, "step": 581 }, { "epoch": 0.39904010970174836, "grad_norm": 0.22898357224048885, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100758621.0909091, "logits/rejected": 124340818.58064516, "logps/chosen": -216.96969696969697, "logps/rejected": -372.64516129032256, "loss": 0.1802, "rewards/chosen": 1.1174242424242424, "rewards/margins": 7.649682306940371, "rewards/rejected": -6.532258064516129, "step": 582 }, { "epoch": 0.399725745629071, "grad_norm": 0.22807566808156554, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111848106.66666667, "logits/rejected": 141434398.11764705, "logps/chosen": -221.33333333333334, "logps/rejected": -411.29411764705884, "loss": 0.1864, "rewards/chosen": 0.9802083333333333, "rewards/margins": 8.090502450980392, "rewards/rejected": -7.110294117647059, "step": 583 }, { "epoch": 0.40041138155639355, "grad_norm": 0.2198186579625624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85833435.42857143, "logits/rejected": 153670620.68965518, "logps/chosen": -252.11428571428573, "logps/rejected": -407.17241379310343, "loss": 0.1957, "rewards/chosen": 0.9125, "rewards/margins": 8.395258620689654, "rewards/rejected": -7.482758620689655, "step": 584 }, { "epoch": 0.4010970174837161, "grad_norm": 0.2722739611442363, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158707051.3548387, "logits/rejected": 60944508.121212125, "logps/chosen": -266.83870967741933, "logps/rejected": -306.90909090909093, "loss": 0.1889, "rewards/chosen": 1.5171370967741935, "rewards/margins": 8.15350073313783, "rewards/rejected": -6.636363636363637, "step": 585 }, { "epoch": 0.40178265341103875, "grad_norm": 0.20390160307893185, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158672653.01694915, "logits/rejected": 105906176.0, "logps/chosen": -294.23728813559325, "logps/rejected": -336.92753623188406, "loss": 0.1883, "rewards/chosen": 1.1917372881355932, "rewards/margins": 7.394635838860231, "rewards/rejected": -6.202898550724638, "step": 586 }, { "epoch": 0.4024682893383613, "grad_norm": 0.1515556772392618, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149317222.4, "logits/rejected": 73955448.47058824, "logps/chosen": -275.46666666666664, "logps/rejected": -323.29411764705884, "loss": 0.1331, "rewards/chosen": 1.8125, "rewards/margins": 7.158088235294118, "rewards/rejected": -5.345588235294118, "step": 587 }, { "epoch": 0.40315392526568394, "grad_norm": 0.1628010296029244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149228921.2631579, "logits/rejected": 59946056.112676054, "logps/chosen": -235.78947368421052, "logps/rejected": -329.6901408450704, "loss": 0.1659, "rewards/chosen": 1.0816885964912282, "rewards/margins": 8.497181554237708, "rewards/rejected": -7.415492957746479, "step": 588 }, { "epoch": 0.4038395611930065, "grad_norm": 0.22779685003131941, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142340030.98412699, "logits/rejected": 88757925.41538462, "logps/chosen": -223.74603174603175, "logps/rejected": -337.96923076923076, "loss": 0.1627, "rewards/chosen": 1.5694444444444444, "rewards/margins": 6.377136752136752, "rewards/rejected": -4.8076923076923075, "step": 589 }, { "epoch": 0.4045251971203291, "grad_norm": 0.19197362308178362, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 182134473.6969697, "logits/rejected": 65552912.51612903, "logps/chosen": -256.969696969697, "logps/rejected": -369.03225806451616, "loss": 0.1877, "rewards/chosen": 1.5416666666666667, "rewards/margins": 8.065860215053764, "rewards/rejected": -6.524193548387097, "step": 590 }, { "epoch": 0.4052108330476517, "grad_norm": 0.18888938619951884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159776768.0, "logits/rejected": 57147392.0, "logps/chosen": -207.0, "logps/rejected": -378.5, "loss": 0.2066, "rewards/chosen": 1.3271484375, "rewards/margins": 7.8583984375, "rewards/rejected": -6.53125, "step": 591 }, { "epoch": 0.4058964689749743, "grad_norm": 0.18665942400223182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116020901.41538462, "logits/rejected": 163644432.25396827, "logps/chosen": -196.6769230769231, "logps/rejected": -406.85714285714283, "loss": 0.1876, "rewards/chosen": 1.0009615384615385, "rewards/margins": 6.389850427350428, "rewards/rejected": -5.388888888888889, "step": 592 }, { "epoch": 0.4065821049022969, "grad_norm": 0.18533790388168261, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102103131.70149253, "logits/rejected": 71010941.90163934, "logps/chosen": -256.95522388059703, "logps/rejected": -328.91803278688525, "loss": 0.1863, "rewards/chosen": 1.289179104477612, "rewards/margins": 8.73999877660876, "rewards/rejected": -7.450819672131147, "step": 593 }, { "epoch": 0.4072677408296195, "grad_norm": 0.3171834535390169, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128546556.39436619, "logits/rejected": 75571056.28070176, "logps/chosen": -312.11267605633805, "logps/rejected": -298.94736842105266, "loss": 0.2005, "rewards/chosen": 1.8838028169014085, "rewards/margins": 8.717136150234742, "rewards/rejected": -6.833333333333333, "step": 594 }, { "epoch": 0.40795337675694204, "grad_norm": 0.38161002630140006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150243725.37313432, "logits/rejected": 62055071.475409836, "logps/chosen": -240.71641791044777, "logps/rejected": -325.7704918032787, "loss": 0.1892, "rewards/chosen": 1.2761194029850746, "rewards/margins": 7.776119402985074, "rewards/rejected": -6.5, "step": 595 }, { "epoch": 0.40863901268426467, "grad_norm": 0.2038648930007314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131903629.2413793, "logits/rejected": 102715509.02857143, "logps/chosen": -230.06896551724137, "logps/rejected": -411.42857142857144, "loss": 0.1642, "rewards/chosen": 1.0926724137931034, "rewards/margins": -18305538.564470444, "rewards/rejected": 18305539.65714286, "step": 596 }, { "epoch": 0.40932464861158724, "grad_norm": 0.16137411130158766, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140581499.58620688, "logits/rejected": 74568733.25714286, "logps/chosen": -260.9655172413793, "logps/rejected": -324.57142857142856, "loss": 0.1374, "rewards/chosen": 1.7974137931034482, "rewards/margins": 8.761699507389162, "rewards/rejected": -6.964285714285714, "step": 597 }, { "epoch": 0.41001028453890986, "grad_norm": 0.2203331314911408, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93051410.96296297, "logits/rejected": 103610644.75675675, "logps/chosen": -209.33333333333334, "logps/rejected": -374.4864864864865, "loss": 0.1695, "rewards/chosen": 0.6684027777777778, "rewards/margins": 7.195429804804805, "rewards/rejected": -6.527027027027027, "step": 598 }, { "epoch": 0.41069592046623243, "grad_norm": 0.1713387162833604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142739488.5079365, "logits/rejected": 90790549.66153847, "logps/chosen": -309.07936507936506, "logps/rejected": -361.84615384615387, "loss": 0.1551, "rewards/chosen": 1.5615079365079365, "rewards/margins": 8.907661782661782, "rewards/rejected": -7.346153846153846, "step": 599 }, { "epoch": 0.411381556393555, "grad_norm": 0.28535749456766113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137093822.17142856, "logits/rejected": 88658908.68965517, "logps/chosen": -264.22857142857146, "logps/rejected": -369.1034482758621, "loss": 0.2269, "rewards/chosen": 0.9571428571428572, "rewards/margins": 6.4743842364532025, "rewards/rejected": -5.517241379310345, "step": 600 }, { "epoch": 0.41206719232087763, "grad_norm": 0.18624274254893164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118176354.80701755, "logits/rejected": 100190698.36619718, "logps/chosen": -210.10526315789474, "logps/rejected": -335.32394366197184, "loss": 0.1862, "rewards/chosen": 1.4298245614035088, "rewards/margins": 7.1692611811218185, "rewards/rejected": -5.73943661971831, "step": 601 }, { "epoch": 0.4127528282482002, "grad_norm": 0.38032100642404615, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87678190.46575342, "logits/rejected": 188247989.52727273, "logps/chosen": -196.3835616438356, "logps/rejected": -456.1454545454545, "loss": 0.2208, "rewards/chosen": 0.7636986301369864, "rewards/margins": 7.072789539227896, "rewards/rejected": -6.3090909090909095, "step": 602 }, { "epoch": 0.4134384641755228, "grad_norm": 0.16423058838629276, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150535294.24657536, "logits/rejected": 75058976.58181818, "logps/chosen": -305.2054794520548, "logps/rejected": -378.4727272727273, "loss": 0.183, "rewards/chosen": 1.6147260273972603, "rewards/margins": 8.81472602739726, "rewards/rejected": -7.2, "step": 603 }, { "epoch": 0.4141241001028454, "grad_norm": 0.18148917880181403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113485882.51428571, "logits/rejected": 88984328.8275862, "logps/chosen": -235.2, "logps/rejected": -388.9655172413793, "loss": 0.1658, "rewards/chosen": 1.6196428571428572, "rewards/margins": 6.80929802955665, "rewards/rejected": -5.189655172413793, "step": 604 }, { "epoch": 0.41480973603016796, "grad_norm": 0.21678384055204158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137712981.33333334, "logits/rejected": 89360263.52941176, "logps/chosen": -209.33333333333334, "logps/rejected": -393.4117647058824, "loss": 0.1767, "rewards/chosen": 1.2416666666666667, "rewards/margins": 7.962254901960785, "rewards/rejected": -6.720588235294118, "step": 605 }, { "epoch": 0.4154953719574906, "grad_norm": 0.24872464608375328, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104111945.95555556, "logits/rejected": 96570059.56626506, "logps/chosen": -193.6, "logps/rejected": -382.8433734939759, "loss": 0.1551, "rewards/chosen": 0.7125, "rewards/margins": 7.453463855421687, "rewards/rejected": -6.740963855421687, "step": 606 }, { "epoch": 0.41618100788481316, "grad_norm": 0.16596096804938348, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138089393.23076922, "logits/rejected": 106089260.6984127, "logps/chosen": -206.03076923076924, "logps/rejected": -338.7936507936508, "loss": 0.1738, "rewards/chosen": 1.3865384615384615, "rewards/margins": 6.31510989010989, "rewards/rejected": -4.928571428571429, "step": 607 }, { "epoch": 0.41686664381213573, "grad_norm": 0.17298484221739258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 189611467.03448275, "logits/rejected": 96169398.85714285, "logps/chosen": -254.06896551724137, "logps/rejected": -439.3142857142857, "loss": 0.1651, "rewards/chosen": 1.3071120689655173, "rewards/margins": 8.542826354679804, "rewards/rejected": -7.235714285714286, "step": 608 }, { "epoch": 0.41755227973945835, "grad_norm": 0.19033867031082333, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109999004.90322581, "logits/rejected": 101997847.27272727, "logps/chosen": -260.38709677419354, "logps/rejected": -360.8484848484849, "loss": 0.1547, "rewards/chosen": 1.65625, "rewards/margins": 7.042613636363637, "rewards/rejected": -5.386363636363637, "step": 609 }, { "epoch": 0.4182379156667809, "grad_norm": 0.2635869860917976, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86173882.18181819, "logits/rejected": 134562465.31506848, "logps/chosen": -258.1818181818182, "logps/rejected": -354.4109589041096, "loss": 0.1591, "rewards/chosen": 1.7840909090909092, "rewards/margins": 7.338885429638855, "rewards/rejected": -5.554794520547945, "step": 610 }, { "epoch": 0.41892355159410355, "grad_norm": 0.19599091072215827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176560225.52380952, "logits/rejected": 66883016.86153846, "logps/chosen": -331.6825396825397, "logps/rejected": -408.61538461538464, "loss": 0.1752, "rewards/chosen": 1.5091765873015872, "rewards/margins": 8.77071504884005, "rewards/rejected": -7.2615384615384615, "step": 611 }, { "epoch": 0.4196091875214261, "grad_norm": 0.2204138091862053, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83473525.50819673, "logits/rejected": 136596586.98507464, "logps/chosen": -282.2295081967213, "logps/rejected": -355.82089552238807, "loss": 0.184, "rewards/chosen": 1.5860655737704918, "rewards/margins": 8.003976021531686, "rewards/rejected": -6.417910447761194, "step": 612 }, { "epoch": 0.4202948234487487, "grad_norm": 0.18717220025233666, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169650206.56716418, "logits/rejected": 29446076.852459017, "logps/chosen": -237.8507462686567, "logps/rejected": -302.95081967213116, "loss": 0.1905, "rewards/chosen": 1.169776119402985, "rewards/margins": 7.932071201370198, "rewards/rejected": -6.762295081967213, "step": 613 }, { "epoch": 0.4209804593760713, "grad_norm": 0.16563606716022827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 202394233.01818183, "logits/rejected": 64293509.26027397, "logps/chosen": -298.1818181818182, "logps/rejected": -367.3424657534247, "loss": 0.1734, "rewards/chosen": 1.4068181818181817, "rewards/margins": 8.352023661270236, "rewards/rejected": -6.945205479452055, "step": 614 }, { "epoch": 0.4216660953033939, "grad_norm": 0.19366765491718102, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111527230.95081967, "logits/rejected": 85826728.11940299, "logps/chosen": -212.45901639344262, "logps/rejected": -330.02985074626866, "loss": 0.1743, "rewards/chosen": 0.8463114754098361, "rewards/margins": 7.547804012723269, "rewards/rejected": -6.701492537313433, "step": 615 }, { "epoch": 0.4223517312307165, "grad_norm": 0.22975387828823718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175592048.81355932, "logits/rejected": 93353657.50724638, "logps/chosen": -310.23728813559325, "logps/rejected": -326.95652173913044, "loss": 0.1726, "rewards/chosen": 1.7341101694915255, "rewards/margins": 7.784834807172684, "rewards/rejected": -6.050724637681159, "step": 616 }, { "epoch": 0.4230373671580391, "grad_norm": 0.17655055580048767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126184569.49152543, "logits/rejected": 80603581.2173913, "logps/chosen": -224.0, "logps/rejected": -352.92753623188406, "loss": 0.1591, "rewards/chosen": 1.3548728813559323, "rewards/margins": 9.246177229182019, "rewards/rejected": -7.891304347826087, "step": 617 }, { "epoch": 0.42372300308536165, "grad_norm": 0.1583435387964769, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168820736.0, "logits/rejected": 36990390.85714286, "logps/chosen": -204.77777777777777, "logps/rejected": -353.42857142857144, "loss": 0.178, "rewards/chosen": 1.5998263888888888, "rewards/margins": 9.992683531746032, "rewards/rejected": -8.392857142857142, "step": 618 }, { "epoch": 0.4244086390126843, "grad_norm": 0.21561157909062617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56262656.0, "logits/rejected": 147324928.0, "logps/chosen": -164.5, "logps/rejected": -392.25, "loss": 0.2093, "rewards/chosen": 0.736328125, "rewards/margins": 7.330078125, "rewards/rejected": -6.59375, "step": 619 }, { "epoch": 0.42509427494000684, "grad_norm": 0.1887589055678199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 178664830.08955225, "logits/rejected": 98738041.70491803, "logps/chosen": -194.38805970149255, "logps/rejected": -392.91803278688525, "loss": 0.1775, "rewards/chosen": 1.4155783582089552, "rewards/margins": 7.399184915586004, "rewards/rejected": -5.983606557377049, "step": 620 }, { "epoch": 0.42577991086732947, "grad_norm": 0.2762383115550189, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114008808.72727273, "logits/rejected": 114802159.48387097, "logps/chosen": -260.1212121212121, "logps/rejected": -378.3225806451613, "loss": 0.1933, "rewards/chosen": 1.074810606060606, "rewards/margins": 7.349004154447703, "rewards/rejected": -6.274193548387097, "step": 621 }, { "epoch": 0.42646554679465204, "grad_norm": 0.20122027890499775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134475839.0153846, "logits/rejected": 157377942.34920636, "logps/chosen": -270.03076923076924, "logps/rejected": -448.5079365079365, "loss": 0.1633, "rewards/chosen": 1.2019230769230769, "rewards/margins": 8.495573870573871, "rewards/rejected": -7.2936507936507935, "step": 622 }, { "epoch": 0.4271511827219746, "grad_norm": 0.19722188678230204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166181217.10344827, "logits/rejected": 67168782.62857144, "logps/chosen": -290.48275862068965, "logps/rejected": -284.8, "loss": 0.1695, "rewards/chosen": 1.3992456896551724, "rewards/margins": 8.677817118226601, "rewards/rejected": -7.2785714285714285, "step": 623 }, { "epoch": 0.42783681864929723, "grad_norm": 0.18285994494052626, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 174203426.13333333, "logits/rejected": 109021063.52941176, "logps/chosen": -250.66666666666666, "logps/rejected": -383.05882352941177, "loss": 0.1661, "rewards/chosen": 1.38125, "rewards/margins": 6.373897058823529, "rewards/rejected": -4.992647058823529, "step": 624 }, { "epoch": 0.4285224545766198, "grad_norm": 0.18334425871264656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148238687.08571428, "logits/rejected": 129589530.48275863, "logps/chosen": -272.9142857142857, "logps/rejected": -404.9655172413793, "loss": 0.1987, "rewards/chosen": 1.2767857142857142, "rewards/margins": 8.949199507389162, "rewards/rejected": -7.672413793103448, "step": 625 }, { "epoch": 0.42920809050394243, "grad_norm": 0.16888086156909804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 190436118.45614034, "logits/rejected": 80430209.8028169, "logps/chosen": -254.87719298245614, "logps/rejected": -370.92957746478874, "loss": 0.1592, "rewards/chosen": 1.6644736842105263, "rewards/margins": 8.079966641957006, "rewards/rejected": -6.415492957746479, "step": 626 }, { "epoch": 0.429893726431265, "grad_norm": 0.2251017878297202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117893950.27027027, "logits/rejected": 148975464.2962963, "logps/chosen": -290.7027027027027, "logps/rejected": -376.8888888888889, "loss": 0.2094, "rewards/chosen": 1.1182432432432432, "rewards/margins": 9.229354354354355, "rewards/rejected": -8.11111111111111, "step": 627 }, { "epoch": 0.43057936235858757, "grad_norm": 0.3638940930140111, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113169947.92727272, "logits/rejected": 111235240.32876712, "logps/chosen": -217.01818181818183, "logps/rejected": -346.73972602739724, "loss": 0.1403, "rewards/chosen": 1.2227272727272727, "rewards/margins": 7.804919053549191, "rewards/rejected": -6.582191780821918, "step": 628 }, { "epoch": 0.4312649982859102, "grad_norm": 0.15386580099071928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133422256.55172414, "logits/rejected": 93053630.17142858, "logps/chosen": -182.06896551724137, "logps/rejected": -390.4, "loss": 0.1555, "rewards/chosen": 1.3491379310344827, "rewards/margins": 8.841995073891626, "rewards/rejected": -7.492857142857143, "step": 629 }, { "epoch": 0.43195063421323276, "grad_norm": 0.17233179569328974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112122733.71428572, "logits/rejected": 146101589.33333334, "logps/chosen": -272.2857142857143, "logps/rejected": -432.0, "loss": 0.1539, "rewards/chosen": 0.875, "rewards/margins": 8.083333333333332, "rewards/rejected": -7.208333333333333, "step": 630 }, { "epoch": 0.4326362701405554, "grad_norm": 0.22227741346067903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159143877.4857143, "logits/rejected": 34928428.137931034, "logps/chosen": -285.48571428571427, "logps/rejected": -340.6896551724138, "loss": 0.2062, "rewards/chosen": 1.332142857142857, "rewards/margins": 6.754556650246306, "rewards/rejected": -5.422413793103448, "step": 631 }, { "epoch": 0.43332190606787796, "grad_norm": 0.2879975124187095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110834483.2, "logits/rejected": 105212265.41176471, "logps/chosen": -248.53333333333333, "logps/rejected": -360.47058823529414, "loss": 0.2036, "rewards/chosen": 1.3208333333333333, "rewards/margins": 7.967892156862746, "rewards/rejected": -6.647058823529412, "step": 632 }, { "epoch": 0.43400754199520053, "grad_norm": 0.20220932601488983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116719616.0, "logits/rejected": 76173312.0, "logps/chosen": -243.0, "logps/rejected": -349.5, "loss": 0.1853, "rewards/chosen": 1.1611328125, "rewards/margins": 7.9423828125, "rewards/rejected": -6.78125, "step": 633 }, { "epoch": 0.43469317792252316, "grad_norm": 0.27010603361928714, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153391689.14285713, "logits/rejected": 97953130.33846153, "logps/chosen": -325.58730158730157, "logps/rejected": -319.0153846153846, "loss": 0.1768, "rewards/chosen": 1.183531746031746, "rewards/margins": 7.152762515262515, "rewards/rejected": -5.969230769230769, "step": 634 }, { "epoch": 0.4353788138498457, "grad_norm": 0.22774108951014577, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121760019.10447761, "logits/rejected": 120672188.85245901, "logps/chosen": -260.05970149253733, "logps/rejected": -354.62295081967216, "loss": 0.1811, "rewards/chosen": 1.6343283582089552, "rewards/margins": 7.2326890139466595, "rewards/rejected": -5.598360655737705, "step": 635 }, { "epoch": 0.43606444977716835, "grad_norm": 0.22727786803845304, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151981839.05882353, "logits/rejected": 73911500.8, "logps/chosen": -264.70588235294116, "logps/rejected": -329.06666666666666, "loss": 0.206, "rewards/chosen": 1.1295955882352942, "rewards/margins": 9.187928921568627, "rewards/rejected": -8.058333333333334, "step": 636 }, { "epoch": 0.4367500857044909, "grad_norm": 0.22073349180727395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158717789.46031746, "logits/rejected": 82272886.15384616, "logps/chosen": -250.66666666666666, "logps/rejected": -354.95384615384614, "loss": 0.1978, "rewards/chosen": 1.1279761904761905, "rewards/margins": 7.8356684981684985, "rewards/rejected": -6.707692307692308, "step": 637 }, { "epoch": 0.4374357216318135, "grad_norm": 0.2699798030335838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99398836.70588236, "logits/rejected": 130303044.26666667, "logps/chosen": -215.52941176470588, "logps/rejected": -419.73333333333335, "loss": 0.2278, "rewards/chosen": 1.0533088235294117, "rewards/margins": 8.494975490196078, "rewards/rejected": -7.441666666666666, "step": 638 }, { "epoch": 0.4381213575591361, "grad_norm": 0.21829304940614294, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70892345.31343284, "logits/rejected": 128854519.60655738, "logps/chosen": -245.73134328358208, "logps/rejected": -338.3606557377049, "loss": 0.1844, "rewards/chosen": 1.0522388059701493, "rewards/margins": 7.421091264986543, "rewards/rejected": -6.368852459016393, "step": 639 }, { "epoch": 0.4388069934864587, "grad_norm": 0.14065065616303207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 187695104.0, "logits/rejected": 81526784.0, "logps/chosen": -262.0, "logps/rejected": -351.1111111111111, "loss": 0.1358, "rewards/chosen": 1.515625, "rewards/margins": 9.293402777777779, "rewards/rejected": -7.777777777777778, "step": 640 }, { "epoch": 0.43949262941378126, "grad_norm": 0.24627380619849698, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125541838.90410958, "logits/rejected": 134446508.21818182, "logps/chosen": -305.09589041095893, "logps/rejected": -330.1818181818182, "loss": 0.1844, "rewards/chosen": 1.4058219178082192, "rewards/margins": 7.733094645080946, "rewards/rejected": -6.327272727272727, "step": 641 }, { "epoch": 0.4401782653411039, "grad_norm": 0.23621117836909616, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93113548.8, "logits/rejected": 167613884.3773585, "logps/chosen": -205.22666666666666, "logps/rejected": -442.5660377358491, "loss": 0.2008, "rewards/chosen": 1.355, "rewards/margins": 7.55311320754717, "rewards/rejected": -6.19811320754717, "step": 642 }, { "epoch": 0.44086390126842645, "grad_norm": 0.18023623355311047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91629410.46153846, "logits/rejected": 132753050.41269842, "logps/chosen": -224.24615384615385, "logps/rejected": -353.26984126984127, "loss": 0.1806, "rewards/chosen": 1.435576923076923, "rewards/margins": -5051672.977121489, "rewards/rejected": 5051674.412698412, "step": 643 }, { "epoch": 0.4415495371957491, "grad_norm": 0.19514113439705502, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152416741.9661017, "logits/rejected": 97927880.3478261, "logps/chosen": -258.3050847457627, "logps/rejected": -392.3478260869565, "loss": 0.1736, "rewards/chosen": 1.840042372881356, "rewards/margins": 8.006709039548022, "rewards/rejected": -6.166666666666667, "step": 644 }, { "epoch": 0.44223517312307165, "grad_norm": 0.16688690901214484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79831586.13333334, "logits/rejected": 118427407.05882353, "logps/chosen": -193.86666666666667, "logps/rejected": -359.52941176470586, "loss": 0.1694, "rewards/chosen": 1.4520833333333334, "rewards/margins": 9607067.805024508, "rewards/rejected": -9607066.352941176, "step": 645 }, { "epoch": 0.4429208090503942, "grad_norm": 0.16519864375601556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 187695104.0, "logits/rejected": 66701084.44444445, "logps/chosen": -247.42857142857142, "logps/rejected": -335.55555555555554, "loss": 0.1754, "rewards/chosen": 0.44419642857142855, "rewards/margins": 6.756696428571429, "rewards/rejected": -6.3125, "step": 646 }, { "epoch": 0.44360644497771684, "grad_norm": 0.18388852375940062, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175636480.0, "logits/rejected": 54099968.0, "logps/chosen": -297.75, "logps/rejected": -301.5, "loss": 0.1752, "rewards/chosen": 1.162109375, "rewards/margins": 6.708984375, "rewards/rejected": -5.546875, "step": 647 }, { "epoch": 0.4442920809050394, "grad_norm": 0.1556518670261078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161007153.5483871, "logits/rejected": 71326999.27272727, "logps/chosen": -224.6451612903226, "logps/rejected": -301.09090909090907, "loss": 0.1683, "rewards/chosen": 1.467741935483871, "rewards/margins": 6.285923753665689, "rewards/rejected": -4.818181818181818, "step": 648 }, { "epoch": 0.44497771683236204, "grad_norm": 0.2271518579329875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110298859.24324325, "logits/rejected": 94915546.07407407, "logps/chosen": -215.35135135135135, "logps/rejected": -432.0, "loss": 0.2339, "rewards/chosen": 0.8023648648648649, "rewards/margins": 5.376438938938939, "rewards/rejected": -4.574074074074074, "step": 649 }, { "epoch": 0.4456633527596846, "grad_norm": 0.2322376919785624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123863040.0, "logits/rejected": 135135232.0, "logps/chosen": -222.0, "logps/rejected": -411.75, "loss": 0.2094, "rewards/chosen": 1.11328125, "rewards/margins": 7.05078125, "rewards/rejected": -5.9375, "step": 650 }, { "epoch": 0.4463489886870072, "grad_norm": 0.17669498775204012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140509184.0, "logits/rejected": 82960865.88235295, "logps/chosen": -246.13333333333333, "logps/rejected": -381.1764705882353, "loss": 0.1924, "rewards/chosen": 1.1020833333333333, "rewards/margins": 6.396200980392157, "rewards/rejected": -5.294117647058823, "step": 651 }, { "epoch": 0.4470346246143298, "grad_norm": 0.20488299057676926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130862284.8, "logits/rejected": 72394836.16438356, "logps/chosen": -252.65454545454546, "logps/rejected": -336.2191780821918, "loss": 0.1639, "rewards/chosen": 1.2159090909090908, "rewards/margins": 8.441936488169365, "rewards/rejected": -7.226027397260274, "step": 652 }, { "epoch": 0.44772026054165237, "grad_norm": 0.16294779170672044, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173184165.16129032, "logits/rejected": 73376488.72727273, "logps/chosen": -263.48387096774195, "logps/rejected": -344.0, "loss": 0.198, "rewards/chosen": 1.094758064516129, "rewards/margins": 7.670515640273705, "rewards/rejected": -6.575757575757576, "step": 653 }, { "epoch": 0.448405896468975, "grad_norm": 0.13701956813582136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112158795.85185185, "logits/rejected": 84679596.97297297, "logps/chosen": -174.66666666666666, "logps/rejected": -361.94594594594594, "loss": 0.1484, "rewards/chosen": 1.4131944444444444, "rewards/margins": 8.521302552552552, "rewards/rejected": -7.108108108108108, "step": 654 }, { "epoch": 0.44909153239629757, "grad_norm": 0.16295413976083933, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135374777.37931034, "logits/rejected": 98626062.62857144, "logps/chosen": -253.79310344827587, "logps/rejected": -381.7142857142857, "loss": 0.1502, "rewards/chosen": 1.8922413793103448, "rewards/margins": 8.735098522167487, "rewards/rejected": -6.8428571428571425, "step": 655 }, { "epoch": 0.44977716832362014, "grad_norm": 0.2073919463326302, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76729371.67567568, "logits/rejected": 119382319.4074074, "logps/chosen": -199.78378378378378, "logps/rejected": -333.9259259259259, "loss": 0.2307, "rewards/chosen": 0.8492398648648649, "rewards/margins": 6.589980605605605, "rewards/rejected": -5.7407407407407405, "step": 656 }, { "epoch": 0.45046280425094276, "grad_norm": 0.18470699212233316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128993244.07017544, "logits/rejected": 102199238.30985916, "logps/chosen": -206.59649122807016, "logps/rejected": -335.77464788732397, "loss": 0.1982, "rewards/chosen": 0.8977521929824561, "rewards/margins": 5.418878953545836, "rewards/rejected": -4.52112676056338, "step": 657 }, { "epoch": 0.45114844017826533, "grad_norm": 0.17432455505385333, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 46198131.01449275, "logits/rejected": 144099223.8644068, "logps/chosen": -173.56521739130434, "logps/rejected": -416.8135593220339, "loss": 0.2086, "rewards/chosen": 0.7083333333333334, "rewards/margins": 9.013418079096045, "rewards/rejected": -8.305084745762711, "step": 658 }, { "epoch": 0.45183407610558796, "grad_norm": 0.2152844060084427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153121633.35211268, "logits/rejected": 55914855.298245616, "logps/chosen": -239.77464788732394, "logps/rejected": -353.96491228070175, "loss": 0.1995, "rewards/chosen": 1.397887323943662, "rewards/margins": 8.345255744996294, "rewards/rejected": -6.947368421052632, "step": 659 }, { "epoch": 0.4525197120329105, "grad_norm": 0.2093261111023891, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78788387.44615385, "logits/rejected": 166573787.42857143, "logps/chosen": -252.33846153846153, "logps/rejected": -370.7936507936508, "loss": 0.1756, "rewards/chosen": 1.0817307692307692, "rewards/margins": 8.343635531135531, "rewards/rejected": -7.261904761904762, "step": 660 }, { "epoch": 0.4532053479602331, "grad_norm": 0.1753902705269229, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139549470.37288135, "logits/rejected": 100724083.01449275, "logps/chosen": -210.03389830508473, "logps/rejected": -379.82608695652175, "loss": 0.1604, "rewards/chosen": 1.7171610169491525, "rewards/margins": 8.738900147383935, "rewards/rejected": -7.021739130434782, "step": 661 }, { "epoch": 0.4538909838875557, "grad_norm": 0.17022146786723846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94663111.1111111, "logits/rejected": 121971858.28571428, "logps/chosen": -207.11111111111111, "logps/rejected": -341.14285714285717, "loss": 0.1721, "rewards/chosen": 1.1041666666666667, "rewards/margins": 4.925595238095238, "rewards/rejected": -3.8214285714285716, "step": 662 }, { "epoch": 0.4545766198148783, "grad_norm": 0.2489574114145018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109051904.0, "logits/rejected": 103809024.0, "logps/chosen": -232.26666666666668, "logps/rejected": -359.52941176470586, "loss": 0.2252, "rewards/chosen": 1.2822916666666666, "rewards/margins": 7.061703431372549, "rewards/rejected": -5.779411764705882, "step": 663 }, { "epoch": 0.4552622557422009, "grad_norm": 0.1775006064277273, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116999006.31578948, "logits/rejected": 83946574.76923077, "logps/chosen": -215.57894736842104, "logps/rejected": -349.2307692307692, "loss": 0.184, "rewards/chosen": 1.618421052631579, "rewards/margins": 5.47419028340081, "rewards/rejected": -3.855769230769231, "step": 664 }, { "epoch": 0.4559478916695235, "grad_norm": 0.23627018121021062, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105683750.78787878, "logits/rejected": 72182618.83870968, "logps/chosen": -237.0909090909091, "logps/rejected": -347.61290322580646, "loss": 0.1741, "rewards/chosen": 1.6856060606060606, "rewards/margins": 7.750122189638319, "rewards/rejected": -6.064516129032258, "step": 665 }, { "epoch": 0.45663352759684606, "grad_norm": 0.2418630636521768, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153409846.3030303, "logits/rejected": 86321482.32258065, "logps/chosen": -296.969696969697, "logps/rejected": -339.0967741935484, "loss": 0.2129, "rewards/chosen": 1.2083333333333333, "rewards/margins": -3716591.565860215, "rewards/rejected": 3716592.7741935486, "step": 666 }, { "epoch": 0.4573191635241687, "grad_norm": 0.17760836819360204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118076012.60606061, "logits/rejected": 102963398.19354838, "logps/chosen": -199.75757575757575, "logps/rejected": -356.64516129032256, "loss": 0.1891, "rewards/chosen": 1.25, "rewards/margins": 7.137096774193548, "rewards/rejected": -5.887096774193548, "step": 667 }, { "epoch": 0.45800479945149125, "grad_norm": 0.17037697255022305, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138557667.55555555, "logits/rejected": 75327432.64864865, "logps/chosen": -307.55555555555554, "logps/rejected": -321.72972972972974, "loss": 0.1553, "rewards/chosen": 2.2847222222222223, "rewards/margins": 9.406343843843844, "rewards/rejected": -7.121621621621622, "step": 668 }, { "epoch": 0.4586904353788139, "grad_norm": 0.1803604053314074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163630284.8, "logits/rejected": 88364373.33333333, "logps/chosen": -264.6, "logps/rejected": -372.3333333333333, "loss": 0.2003, "rewards/chosen": 1.53984375, "rewards/margins": 18678165.53984375, "rewards/rejected": -18678164.0, "step": 669 }, { "epoch": 0.45937607130613645, "grad_norm": 0.18024967447696258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135639525.9661017, "logits/rejected": 103885007.76811594, "logps/chosen": -286.64406779661016, "logps/rejected": -345.04347826086956, "loss": 0.1548, "rewards/chosen": 1.8771186440677967, "rewards/margins": 10.210451977401132, "rewards/rejected": -8.333333333333334, "step": 670 }, { "epoch": 0.460061707233459, "grad_norm": 0.17211905138806058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 177462448.55172414, "logits/rejected": 67917765.48571429, "logps/chosen": -270.8965517241379, "logps/rejected": -365.25714285714287, "loss": 0.1495, "rewards/chosen": 1.2823275862068966, "rewards/margins": 8.382327586206896, "rewards/rejected": -7.1, "step": 671 }, { "epoch": 0.46074734316078164, "grad_norm": 0.2739707511164654, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151359666.08695653, "logits/rejected": 102795992.94915254, "logps/chosen": -266.4347826086956, "logps/rejected": -426.8474576271187, "loss": 0.2024, "rewards/chosen": 1.2042572463768115, "rewards/margins": 6.983918263325964, "rewards/rejected": -5.779661016949152, "step": 672 }, { "epoch": 0.4614329790881042, "grad_norm": 0.3125544779225738, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117921825.5737705, "logits/rejected": 91359140.29850747, "logps/chosen": -193.8360655737705, "logps/rejected": -353.910447761194, "loss": 0.1855, "rewards/chosen": 1.1209016393442623, "rewards/margins": 8.934334475165159, "rewards/rejected": -7.813432835820896, "step": 673 }, { "epoch": 0.4621186150154268, "grad_norm": 0.20190289418541196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111021955.87878788, "logits/rejected": 130429324.38709678, "logps/chosen": -177.45454545454547, "logps/rejected": -409.80645161290323, "loss": 0.1761, "rewards/chosen": 1.2585227272727273, "rewards/margins": 8.80690982404692, "rewards/rejected": -7.548387096774194, "step": 674 }, { "epoch": 0.4628042509427494, "grad_norm": 0.1822300890842506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147234533.5172414, "logits/rejected": 87855689.14285715, "logps/chosen": -210.48275862068965, "logps/rejected": -371.65714285714284, "loss": 0.1443, "rewards/chosen": 1.6379310344827587, "rewards/margins": 9.016502463054188, "rewards/rejected": -7.378571428571429, "step": 675 }, { "epoch": 0.463489886870072, "grad_norm": 0.20026705669207945, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125558519.74193548, "logits/rejected": 95833491.39393939, "logps/chosen": -201.5483870967742, "logps/rejected": -334.54545454545456, "loss": 0.1576, "rewards/chosen": 1.1481854838709677, "rewards/margins": 8.595155180840665, "rewards/rejected": -7.446969696969697, "step": 676 }, { "epoch": 0.4641755227973946, "grad_norm": 0.22282283047512133, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96104269.91304348, "logits/rejected": 103506891.9322034, "logps/chosen": -208.69565217391303, "logps/rejected": -339.2542372881356, "loss": 0.1921, "rewards/chosen": 1.1748188405797102, "rewards/margins": 6.234140874478015, "rewards/rejected": -5.059322033898305, "step": 677 }, { "epoch": 0.46486115872471717, "grad_norm": 0.1985330786536146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133824512.0, "logits/rejected": 53870592.0, "logps/chosen": -194.5, "logps/rejected": -336.25, "loss": 0.1855, "rewards/chosen": 0.94140625, "rewards/margins": 9.00390625, "rewards/rejected": -8.0625, "step": 678 }, { "epoch": 0.46554679465203974, "grad_norm": 0.16879244265630405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124510910.17142858, "logits/rejected": 99831666.7586207, "logps/chosen": -223.31428571428572, "logps/rejected": -345.1034482758621, "loss": 0.1954, "rewards/chosen": 1.4205357142857142, "rewards/margins": 8.33432881773399, "rewards/rejected": -6.913793103448276, "step": 679 }, { "epoch": 0.46623243057936237, "grad_norm": 0.23848686626011045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104624583.1111111, "logits/rejected": 121260324.57142857, "logps/chosen": -203.77777777777777, "logps/rejected": -385.7142857142857, "loss": 0.1835, "rewards/chosen": 1.4496527777777777, "rewards/margins": 7.547867063492063, "rewards/rejected": -6.098214285714286, "step": 680 }, { "epoch": 0.46691806650668494, "grad_norm": 0.22819910696869852, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111487306.32258065, "logits/rejected": 73622745.21212122, "logps/chosen": -214.32258064516128, "logps/rejected": -289.2121212121212, "loss": 0.1869, "rewards/chosen": 1.1804435483870968, "rewards/margins": 7.1160496089931575, "rewards/rejected": -5.9356060606060606, "step": 681 }, { "epoch": 0.46760370243400756, "grad_norm": 0.16938915025970427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82656715.03448276, "logits/rejected": 85623720.22857143, "logps/chosen": -219.72413793103448, "logps/rejected": -372.57142857142856, "loss": 0.1686, "rewards/chosen": 1.1675646551724137, "rewards/margins": 8.881850369458128, "rewards/rejected": -7.714285714285714, "step": 682 }, { "epoch": 0.46828933836133013, "grad_norm": 0.16616355933767846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110869435.73333333, "logits/rejected": 96777396.70588236, "logps/chosen": -178.13333333333333, "logps/rejected": -345.88235294117646, "loss": 0.1691, "rewards/chosen": 1.2, "rewards/margins": 8.464705882352941, "rewards/rejected": -7.264705882352941, "step": 683 }, { "epoch": 0.4689749742886527, "grad_norm": 0.26820175986374445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134816914.2857143, "logits/rejected": 106448542.89655173, "logps/chosen": -215.54285714285714, "logps/rejected": -391.17241379310343, "loss": 0.1754, "rewards/chosen": 1.4357142857142857, "rewards/margins": -29179542.840147782, "rewards/rejected": 29179544.275862068, "step": 684 }, { "epoch": 0.4696606102159753, "grad_norm": 0.2082294682213742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92655988.36363636, "logits/rejected": 76546048.0, "logps/chosen": -195.87878787878788, "logps/rejected": -349.4193548387097, "loss": 0.1996, "rewards/chosen": 0.7102272727272727, "rewards/margins": 7.8715175953079175, "rewards/rejected": -7.161290322580645, "step": 685 }, { "epoch": 0.4703462461432979, "grad_norm": 0.19840894833085795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116790690.25352113, "logits/rejected": 81181857.68421052, "logps/chosen": -216.11267605633802, "logps/rejected": -354.5263157894737, "loss": 0.2043, "rewards/chosen": 1.2297535211267605, "rewards/margins": 8.282385100074128, "rewards/rejected": -7.052631578947368, "step": 686 }, { "epoch": 0.4710318820706205, "grad_norm": 0.367098941445939, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121770116.12903225, "logits/rejected": 120618015.03030303, "logps/chosen": -233.80645161290323, "logps/rejected": -358.3030303030303, "loss": 0.196, "rewards/chosen": 0.9969758064516129, "rewards/margins": 7.0197030791788855, "rewards/rejected": -6.0227272727272725, "step": 687 }, { "epoch": 0.4717175179979431, "grad_norm": 0.15563617723446846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107304277.33333333, "logits/rejected": 116823702.58823529, "logps/chosen": -253.6, "logps/rejected": -354.8235294117647, "loss": 0.1626, "rewards/chosen": 1.615625, "rewards/margins": 9.086213235294117, "rewards/rejected": -7.470588235294118, "step": 688 }, { "epoch": 0.47240315392526566, "grad_norm": 0.21050717289835566, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115784865.68421052, "logits/rejected": 111330540.3076923, "logps/chosen": -217.05263157894737, "logps/rejected": -353.2307692307692, "loss": 0.2186, "rewards/chosen": 1.162828947368421, "rewards/margins": 7.225328947368421, "rewards/rejected": -6.0625, "step": 689 }, { "epoch": 0.4730887898525883, "grad_norm": 0.2479276295090414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143138946.03174603, "logits/rejected": 101437629.04615384, "logps/chosen": -269.46031746031747, "logps/rejected": -389.4153846153846, "loss": 0.1538, "rewards/chosen": 1.1617063492063493, "rewards/margins": 7.884783272283272, "rewards/rejected": -6.723076923076923, "step": 690 }, { "epoch": 0.47377442577991086, "grad_norm": 0.17640147262681857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117227242.30508475, "logits/rejected": 78476035.71014492, "logps/chosen": -267.93220338983053, "logps/rejected": -368.69565217391306, "loss": 0.154, "rewards/chosen": 1.7648305084745763, "rewards/margins": 8.902511667894867, "rewards/rejected": -7.13768115942029, "step": 691 }, { "epoch": 0.4744600617072335, "grad_norm": 0.17278286857030667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109120663.08196722, "logits/rejected": 125203104.47761194, "logps/chosen": -193.5737704918033, "logps/rejected": -412.65671641791045, "loss": 0.129, "rewards/chosen": 1.3545081967213115, "rewards/margins": 8.578388793736238, "rewards/rejected": -7.223880597014926, "step": 692 }, { "epoch": 0.47514569763455605, "grad_norm": 0.20598211530804308, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128758475.17460318, "logits/rejected": 94307312.24615385, "logps/chosen": -284.6984126984127, "logps/rejected": -364.8, "loss": 0.1606, "rewards/chosen": 2.0793650793650795, "rewards/margins": 9.063980463980464, "rewards/rejected": -6.984615384615385, "step": 693 }, { "epoch": 0.4758313335618786, "grad_norm": 0.1962915542449676, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139061853.7464789, "logits/rejected": 114570725.05263157, "logps/chosen": -238.19718309859155, "logps/rejected": -360.70175438596493, "loss": 0.2057, "rewards/chosen": 0.9198943661971831, "rewards/margins": 8.946210155670869, "rewards/rejected": -8.026315789473685, "step": 694 }, { "epoch": 0.47651696948920125, "grad_norm": 0.183226054759029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110624768.0, "logits/rejected": 69337088.0, "logps/chosen": -258.0, "logps/rejected": -323.5, "loss": 0.194, "rewards/chosen": 1.34765625, "rewards/margins": 4.79296875, "rewards/rejected": -3.4453125, "step": 695 }, { "epoch": 0.4772026054165238, "grad_norm": 0.1869621010810507, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136830573.1147541, "logits/rejected": 80756002.3880597, "logps/chosen": -240.0, "logps/rejected": -397.3731343283582, "loss": 0.1617, "rewards/chosen": 1.1547131147540983, "rewards/margins": 8.669638487888427, "rewards/rejected": -7.514925373134329, "step": 696 }, { "epoch": 0.47788824134384644, "grad_norm": 0.24721391637713724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137964639.57333332, "logits/rejected": 81670221.28301887, "logps/chosen": -245.76, "logps/rejected": -369.811320754717, "loss": 0.2003, "rewards/chosen": 1.7683333333333333, "rewards/margins": 6.6173899371069185, "rewards/rejected": -4.849056603773585, "step": 697 }, { "epoch": 0.478573877271169, "grad_norm": 0.21298125693995607, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107352945.41772152, "logits/rejected": 120950031.6734694, "logps/chosen": -258.2278481012658, "logps/rejected": -377.7959183673469, "loss": 0.2284, "rewards/chosen": 1.4691455696202531, "rewards/margins": 6.979349651252907, "rewards/rejected": -5.510204081632653, "step": 698 }, { "epoch": 0.4792595131984916, "grad_norm": 0.24071659661083594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122647234.20689656, "logits/rejected": 93892490.97142857, "logps/chosen": -228.41379310344828, "logps/rejected": -347.8857142857143, "loss": 0.1656, "rewards/chosen": 1.4601293103448276, "rewards/margins": 8.410129310344828, "rewards/rejected": -6.95, "step": 699 }, { "epoch": 0.4799451491258142, "grad_norm": 0.1594704646513428, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160160274.96296296, "logits/rejected": 116590315.24324325, "logps/chosen": -269.037037037037, "logps/rejected": -373.6216216216216, "loss": 0.1472, "rewards/chosen": 1.65625, "rewards/margins": 9.318412162162161, "rewards/rejected": -7.662162162162162, "step": 700 }, { "epoch": 0.4806307850531368, "grad_norm": 0.16609322106929997, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105296095.41818182, "logits/rejected": 97273379.06849316, "logps/chosen": -288.0, "logps/rejected": -347.17808219178085, "loss": 0.1531, "rewards/chosen": 1.7488636363636363, "rewards/margins": 9.14612391033624, "rewards/rejected": -7.397260273972603, "step": 701 }, { "epoch": 0.4813164209804594, "grad_norm": 0.16571747600779063, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61628570.56603774, "logits/rejected": 156251805.01333332, "logps/chosen": -241.50943396226415, "logps/rejected": -414.29333333333335, "loss": 0.1469, "rewards/chosen": 1.8443396226415094, "rewards/margins": 8.82433962264151, "rewards/rejected": -6.98, "step": 702 }, { "epoch": 0.482002056907782, "grad_norm": 0.2081435035334363, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111642503.52941176, "logits/rejected": 117370606.93333334, "logps/chosen": -243.05882352941177, "logps/rejected": -378.93333333333334, "loss": 0.1712, "rewards/chosen": 1.8161764705882353, "rewards/margins": 8.591176470588236, "rewards/rejected": -6.775, "step": 703 }, { "epoch": 0.48268769283510454, "grad_norm": 0.21082528183332022, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85983232.0, "logits/rejected": 71098368.0, "logps/chosen": -178.75, "logps/rejected": -310.5, "loss": 0.2001, "rewards/chosen": 0.7392578125, "rewards/margins": 8.5048828125, "rewards/rejected": -7.765625, "step": 704 }, { "epoch": 0.48337332876242717, "grad_norm": 0.1937627276074102, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92041671.1111111, "logits/rejected": 106430464.0, "logps/chosen": -164.88888888888889, "logps/rejected": -338.2857142857143, "loss": 0.2121, "rewards/chosen": 0.9244791666666666, "rewards/margins": 8.147693452380953, "rewards/rejected": -7.223214285714286, "step": 705 }, { "epoch": 0.48405896468974974, "grad_norm": 0.19157058429243023, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102175995.80327868, "logits/rejected": 94215336.11940299, "logps/chosen": -204.0655737704918, "logps/rejected": -373.97014925373134, "loss": 0.1937, "rewards/chosen": 1.0819672131147542, "rewards/margins": 8.58942989968192, "rewards/rejected": -7.507462686567164, "step": 706 }, { "epoch": 0.4847446006170723, "grad_norm": 0.2244915225198509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124491805.68115942, "logits/rejected": 80615944.6779661, "logps/chosen": -278.4927536231884, "logps/rejected": -335.1864406779661, "loss": 0.1862, "rewards/chosen": 1.5353260869565217, "rewards/margins": 7.323461680176861, "rewards/rejected": -5.788135593220339, "step": 707 }, { "epoch": 0.48543023654439493, "grad_norm": 0.1973944272813684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130642255.73770492, "logits/rejected": 115312059.2238806, "logps/chosen": -238.1639344262295, "logps/rejected": -391.1641791044776, "loss": 0.1833, "rewards/chosen": 1.1869877049180328, "rewards/margins": 8.261614570589675, "rewards/rejected": -7.074626865671642, "step": 708 }, { "epoch": 0.4861158724717175, "grad_norm": 0.20127645139769082, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90643569.77777778, "logits/rejected": 91338459.42857143, "logps/chosen": -204.0, "logps/rejected": -305.42857142857144, "loss": 0.1906, "rewards/chosen": 0.8350694444444444, "rewards/margins": 8.210069444444445, "rewards/rejected": -7.375, "step": 709 }, { "epoch": 0.48680150839904013, "grad_norm": 0.17566692070374004, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163111822.2222222, "logits/rejected": 120150677.66153847, "logps/chosen": -236.44444444444446, "logps/rejected": -337.4769230769231, "loss": 0.1457, "rewards/chosen": 2.0515873015873014, "rewards/margins": -32216154.502258852, "rewards/rejected": 32216156.553846154, "step": 710 }, { "epoch": 0.4874871443263627, "grad_norm": 0.205544245817563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125960192.0, "logits/rejected": 98041856.0, "logps/chosen": -213.5, "logps/rejected": -322.0, "loss": 0.1939, "rewards/chosen": 1.2763671875, "rewards/margins": 7.8935546875, "rewards/rejected": -6.6171875, "step": 711 }, { "epoch": 0.48817278025368527, "grad_norm": 0.19290686178697256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69625446.4, "logits/rejected": 130276528.55172414, "logps/chosen": -271.77142857142854, "logps/rejected": -350.3448275862069, "loss": 0.1997, "rewards/chosen": 1.2612165178571428, "rewards/margins": 8.700871690270935, "rewards/rejected": -7.439655172413793, "step": 712 }, { "epoch": 0.4888584161810079, "grad_norm": 0.21987371571813288, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156293012.21052632, "logits/rejected": 96291767.88732395, "logps/chosen": -291.9298245614035, "logps/rejected": -319.5492957746479, "loss": 0.162, "rewards/chosen": 1.575657894736842, "rewards/margins": 6.864390289103039, "rewards/rejected": -5.288732394366197, "step": 713 }, { "epoch": 0.48954405210833046, "grad_norm": 0.17163803553796927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120201762.13333334, "logits/rejected": 124842224.94117647, "logps/chosen": -182.93333333333334, "logps/rejected": -392.47058823529414, "loss": 0.1729, "rewards/chosen": 0.9171875, "rewards/margins": 8.637775735294118, "rewards/rejected": -7.720588235294118, "step": 714 }, { "epoch": 0.4902296880356531, "grad_norm": 0.20511251317410928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96262714.75409836, "logits/rejected": 99286061.85074627, "logps/chosen": -237.63934426229508, "logps/rejected": -336.7164179104478, "loss": 0.1765, "rewards/chosen": 1.180327868852459, "rewards/margins": 8.590775630046489, "rewards/rejected": -7.41044776119403, "step": 715 }, { "epoch": 0.49091532396297566, "grad_norm": 0.17812141886689303, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124070218.32258065, "logits/rejected": 100027795.39393939, "logps/chosen": -230.70967741935485, "logps/rejected": -385.93939393939394, "loss": 0.1827, "rewards/chosen": 1.3961693548387097, "rewards/margins": 8.752229960899315, "rewards/rejected": -7.356060606060606, "step": 716 }, { "epoch": 0.49160095989029823, "grad_norm": 0.17279485551785592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82041539.49090908, "logits/rejected": 114567701.0410959, "logps/chosen": -226.61818181818182, "logps/rejected": -402.8493150684931, "loss": 0.1502, "rewards/chosen": 0.9926136363636363, "rewards/margins": 9.122750622665006, "rewards/rejected": -8.13013698630137, "step": 717 }, { "epoch": 0.49228659581762085, "grad_norm": 0.1916707573614944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175078366.96774194, "logits/rejected": 78230124.60606061, "logps/chosen": -206.19354838709677, "logps/rejected": -390.3030303030303, "loss": 0.1594, "rewards/chosen": 1.3326612903225807, "rewards/margins": 9.438721896383186, "rewards/rejected": -8.106060606060606, "step": 718 }, { "epoch": 0.4929722317449434, "grad_norm": 0.20637592376262, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148806611.47826087, "logits/rejected": 70165729.62711865, "logps/chosen": -260.6376811594203, "logps/rejected": -410.03389830508473, "loss": 0.2299, "rewards/chosen": 0.9384057971014492, "rewards/margins": 8.404507492016704, "rewards/rejected": -7.466101694915254, "step": 719 }, { "epoch": 0.49365786767226605, "grad_norm": 0.15170664107476675, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70070631.29824561, "logits/rejected": 114486776.7887324, "logps/chosen": -234.10526315789474, "logps/rejected": -388.9577464788732, "loss": 0.1568, "rewards/chosen": 1.388157894736842, "rewards/margins": 9.212101556708673, "rewards/rejected": -7.823943661971831, "step": 720 }, { "epoch": 0.4943435035995886, "grad_norm": 0.17465992161136074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91209467.93650794, "logits/rejected": 111245847.63076924, "logps/chosen": -203.68253968253967, "logps/rejected": -373.66153846153844, "loss": 0.1703, "rewards/chosen": 1.3898809523809523, "rewards/margins": 5.474496336996337, "rewards/rejected": -4.084615384615384, "step": 721 }, { "epoch": 0.4950291395269112, "grad_norm": 0.21914185022751453, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160932218.0923077, "logits/rejected": 11684132.57142857, "logps/chosen": -251.3230769230769, "logps/rejected": -346.92063492063494, "loss": 0.2054, "rewards/chosen": 1.2836538461538463, "rewards/margins": 6.204288766788768, "rewards/rejected": -4.920634920634921, "step": 722 }, { "epoch": 0.4957147754542338, "grad_norm": 0.1970711269909451, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135416100.57142857, "logits/rejected": 136282616.12307692, "logps/chosen": -260.3174603174603, "logps/rejected": -353.96923076923076, "loss": 0.1793, "rewards/chosen": 0.8888888888888888, "rewards/margins": 7.935042735042735, "rewards/rejected": -7.046153846153846, "step": 723 }, { "epoch": 0.4964004113815564, "grad_norm": 0.19091701402196912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 52567582.11764706, "logits/rejected": 150575513.6, "logps/chosen": -215.2941176470588, "logps/rejected": -445.8666666666667, "loss": 0.2009, "rewards/chosen": 1.2591911764705883, "rewards/margins": 9.875857843137256, "rewards/rejected": -8.616666666666667, "step": 724 }, { "epoch": 0.497086047308879, "grad_norm": 0.21370350796828405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150501496.47058824, "logits/rejected": 80915114.66666667, "logps/chosen": -274.5882352941176, "logps/rejected": -340.53333333333336, "loss": 0.1772, "rewards/chosen": 1.5808823529411764, "rewards/margins": 6.73921568627451, "rewards/rejected": -5.158333333333333, "step": 725 }, { "epoch": 0.4977716832362016, "grad_norm": 0.16603502145173515, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139954176.0, "logits/rejected": 36995072.0, "logps/chosen": -268.5, "logps/rejected": -318.5, "loss": 0.157, "rewards/chosen": 1.998046875, "rewards/margins": 9.982421875, "rewards/rejected": -7.984375, "step": 726 }, { "epoch": 0.49845731916352415, "grad_norm": 0.1852014246825411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155748488.53333333, "logits/rejected": 62174388.705882356, "logps/chosen": -223.46666666666667, "logps/rejected": -358.11764705882354, "loss": 0.1679, "rewards/chosen": 1.8145833333333334, "rewards/margins": 9.549877450980393, "rewards/rejected": -7.735294117647059, "step": 727 }, { "epoch": 0.4991429550908468, "grad_norm": 0.14951083629439763, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110607855.48387097, "logits/rejected": 74417120.96969697, "logps/chosen": -190.96774193548387, "logps/rejected": -320.969696969697, "loss": 0.1696, "rewards/chosen": 0.9193548387096774, "rewards/margins": 7.6542033235581615, "rewards/rejected": -6.734848484848484, "step": 728 }, { "epoch": 0.49982859101816934, "grad_norm": 0.25342035499303955, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146684131.55555555, "logits/rejected": 57821476.571428575, "logps/chosen": -248.88888888888889, "logps/rejected": -355.14285714285717, "loss": 0.1885, "rewards/chosen": 1.5998263888888888, "rewards/margins": 8.885540674603174, "rewards/rejected": -7.285714285714286, "step": 729 }, { "epoch": 0.500514226945492, "grad_norm": 0.15551542201910484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128583588.29850747, "logits/rejected": 65733682.36065574, "logps/chosen": -204.4179104477612, "logps/rejected": -315.5409836065574, "loss": 0.1907, "rewards/chosen": 1.5652985074626866, "rewards/margins": 8.778413261561047, "rewards/rejected": -7.213114754098361, "step": 730 }, { "epoch": 0.5011998628728145, "grad_norm": 0.20762816734834127, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119601214.06060606, "logits/rejected": 106887101.93548387, "logps/chosen": -233.21212121212122, "logps/rejected": -343.741935483871, "loss": 0.2136, "rewards/chosen": 1.0520833333333333, "rewards/margins": 6.519825268817204, "rewards/rejected": -5.467741935483871, "step": 731 }, { "epoch": 0.5018854988001371, "grad_norm": 0.17295522891851906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116391936.0, "logits/rejected": 124011588.26666667, "logps/chosen": -220.7058823529412, "logps/rejected": -380.26666666666665, "loss": 0.1841, "rewards/chosen": 1.1047794117647058, "rewards/margins": 7.688112745098039, "rewards/rejected": -6.583333333333333, "step": 732 }, { "epoch": 0.5025711347274597, "grad_norm": 0.16345023589234547, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109119554.06451613, "logits/rejected": 95706391.27272727, "logps/chosen": -201.29032258064515, "logps/rejected": -376.72727272727275, "loss": 0.154, "rewards/chosen": 1.3548387096774193, "rewards/margins": 9.657869012707723, "rewards/rejected": -8.303030303030303, "step": 733 }, { "epoch": 0.5032567706547824, "grad_norm": 0.1696937515103778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77655118.76923077, "logits/rejected": 119979169.68421052, "logps/chosen": -168.15384615384616, "logps/rejected": -373.05263157894734, "loss": 0.1388, "rewards/chosen": 1.2950721153846154, "rewards/margins": 9.229282641700404, "rewards/rejected": -7.934210526315789, "step": 734 }, { "epoch": 0.5039424065821049, "grad_norm": 0.19717869414536512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64111519.39622641, "logits/rejected": 107486030.50666666, "logps/chosen": -167.8490566037736, "logps/rejected": -357.12, "loss": 0.1624, "rewards/chosen": 0.9221698113207547, "rewards/margins": 8.84883647798742, "rewards/rejected": -7.926666666666667, "step": 735 }, { "epoch": 0.5046280425094275, "grad_norm": 0.1741710388385002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107518165.97014925, "logits/rejected": 152851439.21311477, "logps/chosen": -271.76119402985074, "logps/rejected": -433.3114754098361, "loss": 0.1747, "rewards/chosen": 1.7555970149253732, "rewards/margins": 10.042482260827013, "rewards/rejected": -8.28688524590164, "step": 736 }, { "epoch": 0.5053136784367501, "grad_norm": 0.2266897454902369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85008499.38028169, "logits/rejected": 89257732.49122807, "logps/chosen": -169.46478873239437, "logps/rejected": -363.2280701754386, "loss": 0.2237, "rewards/chosen": 0.4392605633802817, "rewards/margins": 7.2287342475908085, "rewards/rejected": -6.7894736842105265, "step": 737 }, { "epoch": 0.5059993143640726, "grad_norm": 0.26460690756491984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126843870.96774194, "logits/rejected": 107145402.18181819, "logps/chosen": -305.80645161290323, "logps/rejected": -398.06060606060606, "loss": 0.1788, "rewards/chosen": 1.5151209677419355, "rewards/margins": 9.310575513196481, "rewards/rejected": -7.795454545454546, "step": 738 }, { "epoch": 0.5066849502913953, "grad_norm": 0.17995742345664584, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115576376.8888889, "logits/rejected": 88790189.29230769, "logps/chosen": -235.17460317460316, "logps/rejected": -341.66153846153844, "loss": 0.1581, "rewards/chosen": 0.8630952380952381, "rewards/margins": 8.870787545787547, "rewards/rejected": -8.007692307692308, "step": 739 }, { "epoch": 0.5073705862187179, "grad_norm": 0.20154530614531777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128310257.57746479, "logits/rejected": 76196522.66666667, "logps/chosen": -204.8450704225352, "logps/rejected": -368.280701754386, "loss": 0.199, "rewards/chosen": 1.119718309859155, "rewards/margins": -6502935.020632568, "rewards/rejected": 6502936.140350877, "step": 740 }, { "epoch": 0.5080562221460404, "grad_norm": 0.15094212047635724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122850556.28985508, "logits/rejected": 118933399.86440678, "logps/chosen": -223.53623188405797, "logps/rejected": -438.23728813559325, "loss": 0.1693, "rewards/chosen": 1.4021739130434783, "rewards/margins": 9.834377302873987, "rewards/rejected": -8.432203389830509, "step": 741 }, { "epoch": 0.508741858073363, "grad_norm": 0.1790109076778576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95842976.47761194, "logits/rejected": 127754374.29508197, "logps/chosen": -197.73134328358208, "logps/rejected": -384.0, "loss": 0.1764, "rewards/chosen": 1.4347014925373134, "rewards/margins": 9.360931000734034, "rewards/rejected": -7.926229508196721, "step": 742 }, { "epoch": 0.5094274940006857, "grad_norm": 0.24678524436862098, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94476697.6, "logits/rejected": 117978243.28205128, "logps/chosen": -192.16, "logps/rejected": -402.46153846153845, "loss": 0.1248, "rewards/chosen": 1.75, "rewards/margins": 8.371794871794872, "rewards/rejected": -6.621794871794871, "step": 743 }, { "epoch": 0.5101131299280083, "grad_norm": 0.16954997373636893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143272098.53968254, "logits/rejected": 74884458.33846153, "logps/chosen": -270.73015873015873, "logps/rejected": -347.0769230769231, "loss": 0.1741, "rewards/chosen": 1.4583333333333333, "rewards/margins": 8.66602564102564, "rewards/rejected": -7.207692307692308, "step": 744 }, { "epoch": 0.5107987658553308, "grad_norm": 0.23680595368087973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99024896.0, "logits/rejected": 92151808.0, "logps/chosen": -214.0, "logps/rejected": -330.0, "loss": 0.2039, "rewards/chosen": 0.58544921875, "rewards/margins": 5.97607421875, "rewards/rejected": -5.390625, "step": 745 }, { "epoch": 0.5114844017826534, "grad_norm": 0.32779319587515804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124468805.1891892, "logits/rejected": 99226358.51851852, "logps/chosen": -256.0, "logps/rejected": -395.25925925925924, "loss": 0.1972, "rewards/chosen": 1.3057432432432432, "rewards/margins": 6.759446946946946, "rewards/rejected": -5.453703703703703, "step": 746 }, { "epoch": 0.512170037709976, "grad_norm": 0.2164700711612044, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136585480.2580645, "logits/rejected": 122143216.48484848, "logps/chosen": -273.80645161290323, "logps/rejected": -381.57575757575756, "loss": 0.1994, "rewards/chosen": 1.5231854838709677, "rewards/margins": 6.621670332355817, "rewards/rejected": -5.098484848484849, "step": 747 }, { "epoch": 0.5128556736372986, "grad_norm": 0.18307223458197058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131548625.45454545, "logits/rejected": 63117510.19354839, "logps/chosen": -272.24242424242425, "logps/rejected": -303.48387096774195, "loss": 0.1916, "rewards/chosen": 1.5397727272727273, "rewards/margins": 8.822030791788857, "rewards/rejected": -7.282258064516129, "step": 748 }, { "epoch": 0.5135413095646212, "grad_norm": 0.1692606434505429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158643380.70588234, "logits/rejected": 71722598.4, "logps/chosen": -244.0, "logps/rejected": -412.26666666666665, "loss": 0.1858, "rewards/chosen": 1.5845588235294117, "rewards/margins": 10.226225490196079, "rewards/rejected": -8.641666666666667, "step": 749 }, { "epoch": 0.5142269454919438, "grad_norm": 0.1770150162935213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62914560.0, "logits/rejected": 154949573.4857143, "logps/chosen": -258.2068965517241, "logps/rejected": -364.8, "loss": 0.1812, "rewards/chosen": 1.9245689655172413, "rewards/margins": 8.65314039408867, "rewards/rejected": -6.728571428571429, "step": 750 }, { "epoch": 0.5149125814192663, "grad_norm": 0.16848883507931822, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135433468.28985506, "logits/rejected": 56018839.86440678, "logps/chosen": -261.3333333333333, "logps/rejected": -324.06779661016947, "loss": 0.1653, "rewards/chosen": 1.431159420289855, "rewards/margins": 8.185396708425449, "rewards/rejected": -6.754237288135593, "step": 751 }, { "epoch": 0.515598217346589, "grad_norm": 0.24043755821024068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138755827.40983605, "logits/rejected": 87454368.47761194, "logps/chosen": -261.24590163934425, "logps/rejected": -380.65671641791045, "loss": 0.1743, "rewards/chosen": 1.6639344262295082, "rewards/margins": 8.857964276975776, "rewards/rejected": -7.1940298507462686, "step": 752 }, { "epoch": 0.5162838532739116, "grad_norm": 0.4004315133965171, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110208953.37931034, "logits/rejected": 99884353.82857142, "logps/chosen": -245.24137931034483, "logps/rejected": -326.4, "loss": 0.152, "rewards/chosen": 0.9989224137931034, "rewards/margins": 8.020350985221675, "rewards/rejected": -7.021428571428571, "step": 753 }, { "epoch": 0.5169694892012341, "grad_norm": 0.19452875536605674, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 181661494.55737704, "logits/rejected": 65324719.76119403, "logps/chosen": -249.44262295081967, "logps/rejected": -439.8805970149254, "loss": 0.1579, "rewards/chosen": 1.7377049180327868, "rewards/margins": 9.34964521654025, "rewards/rejected": -7.611940298507463, "step": 754 }, { "epoch": 0.5176551251285567, "grad_norm": 0.25967764373798075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172676269.2923077, "logits/rejected": 71502896.76190476, "logps/chosen": -241.23076923076923, "logps/rejected": -329.3968253968254, "loss": 0.1891, "rewards/chosen": 1.0711538461538461, "rewards/margins": 8.78543956043956, "rewards/rejected": -7.714285714285714, "step": 755 }, { "epoch": 0.5183407610558793, "grad_norm": 0.1643733519557323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145582938.83870968, "logits/rejected": 107336052.36363636, "logps/chosen": -260.9032258064516, "logps/rejected": -390.3030303030303, "loss": 0.1507, "rewards/chosen": 1.4112903225806452, "rewards/margins": 9.373411534701857, "rewards/rejected": -7.962121212121212, "step": 756 }, { "epoch": 0.519026396983202, "grad_norm": 0.20100677782310394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142241613.91304347, "logits/rejected": 92043645.83050847, "logps/chosen": -251.59420289855072, "logps/rejected": -315.66101694915255, "loss": 0.2176, "rewards/chosen": 1.1349637681159421, "rewards/margins": 8.584116310488824, "rewards/rejected": -7.4491525423728815, "step": 757 }, { "epoch": 0.5197120329105245, "grad_norm": 0.1778847752506466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105063877.24590164, "logits/rejected": 75090561.91044776, "logps/chosen": -210.36065573770492, "logps/rejected": -334.32835820895525, "loss": 0.1647, "rewards/chosen": 1.4436475409836065, "rewards/margins": 8.391408735013457, "rewards/rejected": -6.947761194029851, "step": 758 }, { "epoch": 0.5203976688378471, "grad_norm": 0.24025380869981253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133285660.44444445, "logits/rejected": 54151460.571428575, "logps/chosen": -244.66666666666666, "logps/rejected": -317.42857142857144, "loss": 0.2031, "rewards/chosen": 1.6041666666666667, "rewards/margins": 6.175595238095238, "rewards/rejected": -4.571428571428571, "step": 759 }, { "epoch": 0.5210833047651697, "grad_norm": 0.16828857783973702, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88316682.81690142, "logits/rejected": 139221459.0877193, "logps/chosen": -229.85915492957747, "logps/rejected": -358.7368421052632, "loss": 0.193, "rewards/chosen": 1.5545774647887325, "rewards/margins": 9.580893254262417, "rewards/rejected": -8.026315789473685, "step": 760 }, { "epoch": 0.5217689406924922, "grad_norm": 0.38308207820183177, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122213340.68965517, "logits/rejected": 123672049.37142856, "logps/chosen": -243.0344827586207, "logps/rejected": -407.77142857142854, "loss": 0.1686, "rewards/chosen": 0.8577586206896551, "rewards/margins": 7.4006157635467975, "rewards/rejected": -6.542857142857143, "step": 761 }, { "epoch": 0.5224545766198149, "grad_norm": 0.15382198277497894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 130786024.72727273, "logps/chosen": -229.93548387096774, "logps/rejected": -341.3333333333333, "loss": 0.1651, "rewards/chosen": 1.5887096774193548, "rewards/margins": 8.467497556207233, "rewards/rejected": -6.878787878787879, "step": 762 }, { "epoch": 0.5231402125471375, "grad_norm": 0.2637221756129648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67508321.52380952, "logits/rejected": 110471514.58461538, "logps/chosen": -205.46031746031747, "logps/rejected": -371.6923076923077, "loss": 0.1854, "rewards/chosen": 1.1200396825396826, "rewards/margins": 7.981578144078144, "rewards/rejected": -6.861538461538461, "step": 763 }, { "epoch": 0.52382584847446, "grad_norm": 0.20792845194691734, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144024997.6470588, "logits/rejected": 70639069.86666666, "logps/chosen": -247.52941176470588, "logps/rejected": -362.1333333333333, "loss": 0.173, "rewards/chosen": 1.5790441176470589, "rewards/margins": 8.645710784313724, "rewards/rejected": -7.066666666666666, "step": 764 }, { "epoch": 0.5245114844017826, "grad_norm": 0.17283005212212976, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126735996.54054055, "logits/rejected": 57089137.777777776, "logps/chosen": -222.9189189189189, "logps/rejected": -354.3703703703704, "loss": 0.1835, "rewards/chosen": 1.478885135135135, "rewards/margins": 7.089996246246246, "rewards/rejected": -5.611111111111111, "step": 765 }, { "epoch": 0.5251971203291053, "grad_norm": 0.14679812177005228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115129364.89795919, "logits/rejected": 108839534.17721519, "logps/chosen": -261.2244897959184, "logps/rejected": -403.44303797468353, "loss": 0.1404, "rewards/chosen": 0.7602040816326531, "rewards/margins": 8.342482562645312, "rewards/rejected": -7.582278481012659, "step": 766 }, { "epoch": 0.5258827562564279, "grad_norm": 0.18647991232377484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121634816.0, "logits/rejected": 98435072.0, "logps/chosen": -237.25, "logps/rejected": -354.0, "loss": 0.1546, "rewards/chosen": 1.94140625, "rewards/margins": 10.26953125, "rewards/rejected": -8.328125, "step": 767 }, { "epoch": 0.5265683921837504, "grad_norm": 0.22308831346789187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116112315.73333333, "logits/rejected": 97270844.23529412, "logps/chosen": -209.33333333333334, "logps/rejected": -332.70588235294116, "loss": 0.1761, "rewards/chosen": 1.2708333333333333, "rewards/margins": 8.535539215686274, "rewards/rejected": -7.264705882352941, "step": 768 }, { "epoch": 0.527254028111073, "grad_norm": 0.16967302475645524, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 199126301.37704918, "logits/rejected": 70364144.71641791, "logps/chosen": -259.9344262295082, "logps/rejected": -343.4029850746269, "loss": 0.1659, "rewards/chosen": 1.6014344262295082, "rewards/margins": 8.95964338145339, "rewards/rejected": -7.358208955223881, "step": 769 }, { "epoch": 0.5279396640383957, "grad_norm": 0.1443586018988274, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73186721.18518518, "logits/rejected": 127416153.94594595, "logps/chosen": -187.11111111111111, "logps/rejected": -423.7837837837838, "loss": 0.1411, "rewards/chosen": 1.4641203703703705, "rewards/margins": 9.639796046046046, "rewards/rejected": -8.175675675675675, "step": 770 }, { "epoch": 0.5286252999657182, "grad_norm": 0.1713174547312798, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101056512.0, "logits/rejected": 124256256.0, "logps/chosen": -251.25, "logps/rejected": -342.5, "loss": 0.1644, "rewards/chosen": 1.43359375, "rewards/margins": 9.19921875, "rewards/rejected": -7.765625, "step": 771 }, { "epoch": 0.5293109358930408, "grad_norm": 0.25315164506088633, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157095749.8181818, "logits/rejected": 126877696.0, "logps/chosen": -278.7878787878788, "logps/rejected": -357.6774193548387, "loss": 0.1848, "rewards/chosen": 1.2424242424242424, "rewards/margins": 7.12952101661779, "rewards/rejected": -5.887096774193548, "step": 772 }, { "epoch": 0.5299965718203634, "grad_norm": 0.20371194370650775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146863241.5522388, "logits/rejected": 84298634.49180327, "logps/chosen": -242.62686567164178, "logps/rejected": -376.39344262295083, "loss": 0.1884, "rewards/chosen": 1.210820895522388, "rewards/margins": 6.36655860044042, "rewards/rejected": -5.155737704918033, "step": 773 }, { "epoch": 0.5306822077476859, "grad_norm": 0.2914835437093909, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147013909.69491526, "logits/rejected": 63347667.47826087, "logps/chosen": -220.47457627118644, "logps/rejected": -283.3623188405797, "loss": 0.1865, "rewards/chosen": 0.8763572563559322, "rewards/margins": 7.963313778095063, "rewards/rejected": -7.086956521739131, "step": 774 }, { "epoch": 0.5313678436750086, "grad_norm": 0.16137112043484095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115375623.87692308, "logits/rejected": 108910429.46031746, "logps/chosen": -248.36923076923077, "logps/rejected": -405.8412698412698, "loss": 0.1926, "rewards/chosen": 1.1067307692307693, "rewards/margins": 8.122603785103784, "rewards/rejected": -7.015873015873016, "step": 775 }, { "epoch": 0.5320534796023312, "grad_norm": 0.20682879645747643, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126494882.53968254, "logits/rejected": 114988457.35384615, "logps/chosen": -216.63492063492063, "logps/rejected": -373.66153846153844, "loss": 0.1713, "rewards/chosen": 1.3194444444444444, "rewards/margins": 9.380982905982906, "rewards/rejected": -8.061538461538461, "step": 776 }, { "epoch": 0.5327391155296538, "grad_norm": 0.2949992224086108, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109440265.48148148, "logits/rejected": 117780590.7027027, "logps/chosen": -273.18518518518516, "logps/rejected": -390.05405405405406, "loss": 0.149, "rewards/chosen": 1.6634837962962963, "rewards/margins": 9.15672703953954, "rewards/rejected": -7.493243243243243, "step": 777 }, { "epoch": 0.5334247514569763, "grad_norm": 0.22566129501858195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129208761.1076923, "logits/rejected": 130980457.65079366, "logps/chosen": -341.9076923076923, "logps/rejected": -359.1111111111111, "loss": 0.1999, "rewards/chosen": 1.0980769230769232, "rewards/margins": 6.621886446886447, "rewards/rejected": -5.523809523809524, "step": 778 }, { "epoch": 0.534110387384299, "grad_norm": 0.19791337796655017, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114621389.63934426, "logits/rejected": 89645422.80597015, "logps/chosen": -280.91803278688525, "logps/rejected": -318.089552238806, "loss": 0.1753, "rewards/chosen": 1.4139344262295082, "rewards/margins": 8.085576217274285, "rewards/rejected": -6.6716417910447765, "step": 779 }, { "epoch": 0.5347960233116216, "grad_norm": 0.21069167980903472, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104857600.0, "logits/rejected": 118203112.72727273, "logps/chosen": -188.1290322580645, "logps/rejected": -334.54545454545456, "loss": 0.1887, "rewards/chosen": 0.9637096774193549, "rewards/margins": 8.910679374389051, "rewards/rejected": -7.946969696969697, "step": 780 }, { "epoch": 0.5354816592389441, "grad_norm": 0.1543065619167995, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144359179.46268657, "logits/rejected": 87152136.39344262, "logps/chosen": -311.8805970149254, "logps/rejected": -412.327868852459, "loss": 0.185, "rewards/chosen": 1.7332089552238805, "rewards/margins": 9.561077807682898, "rewards/rejected": -7.827868852459017, "step": 781 }, { "epoch": 0.5361672951662667, "grad_norm": 0.20368146687896216, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164996517.6470588, "logits/rejected": 87363857.06666666, "logps/chosen": -233.88235294117646, "logps/rejected": -400.0, "loss": 0.1903, "rewards/chosen": 1.2325367647058822, "rewards/margins": 9.449203431372549, "rewards/rejected": -8.216666666666667, "step": 782 }, { "epoch": 0.5368529310935893, "grad_norm": 0.22957356147780125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128752422.78787878, "logits/rejected": 90786386.58064516, "logps/chosen": -205.33333333333334, "logps/rejected": -357.16129032258067, "loss": 0.1692, "rewards/chosen": 1.7007575757575757, "rewards/margins": 9.03946725317693, "rewards/rejected": -7.338709677419355, "step": 783 }, { "epoch": 0.5375385670209119, "grad_norm": 0.21314720398327794, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64196152.88888889, "logits/rejected": 138112438.85714287, "logps/chosen": -243.55555555555554, "logps/rejected": -370.2857142857143, "loss": 0.2128, "rewards/chosen": 1.3645833333333333, "rewards/margins": 7.025297619047619, "rewards/rejected": -5.660714285714286, "step": 784 }, { "epoch": 0.5382242029482345, "grad_norm": 0.1865782003943986, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118920854.58823529, "logits/rejected": 129673898.66666667, "logps/chosen": -291.52941176470586, "logps/rejected": -393.6, "loss": 0.1732, "rewards/chosen": 1.849264705882353, "rewards/margins": 9.849264705882353, "rewards/rejected": -8.0, "step": 785 }, { "epoch": 0.5389098388755571, "grad_norm": 0.19502168751006302, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135033287.1111111, "logits/rejected": 94222043.42857143, "logps/chosen": -244.88888888888889, "logps/rejected": -313.14285714285717, "loss": 0.1997, "rewards/chosen": 1.3923611111111112, "rewards/margins": 9.106646825396826, "rewards/rejected": -7.714285714285714, "step": 786 }, { "epoch": 0.5395954748028796, "grad_norm": 0.14831343234161007, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92679401.54385965, "logits/rejected": 123820580.05633803, "logps/chosen": -215.01754385964912, "logps/rejected": -345.23943661971833, "loss": 0.1808, "rewards/chosen": 1.125, "rewards/margins": 8.639084507042254, "rewards/rejected": -7.514084507042254, "step": 787 }, { "epoch": 0.5402811107302022, "grad_norm": 0.2532077637572844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104458142.47619048, "logits/rejected": 110471514.58461538, "logps/chosen": -271.74603174603175, "logps/rejected": -361.84615384615387, "loss": 0.1964, "rewards/chosen": 1.3835410466269842, "rewards/margins": 8.868156431242369, "rewards/rejected": -7.484615384615385, "step": 788 }, { "epoch": 0.5409667466575249, "grad_norm": 0.231889187285288, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156503880.59701493, "logits/rejected": 66833827.67213115, "logps/chosen": -237.8507462686567, "logps/rejected": -391.8688524590164, "loss": 0.1777, "rewards/chosen": 1.6823694029850746, "rewards/margins": 9.305320222657206, "rewards/rejected": -7.622950819672131, "step": 789 }, { "epoch": 0.5416523825848475, "grad_norm": 0.14852223691727906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118111422.91525424, "logits/rejected": 104530869.79710145, "logps/chosen": -261.2881355932203, "logps/rejected": -367.30434782608694, "loss": 0.1888, "rewards/chosen": 1.0752118644067796, "rewards/margins": 8.821588676000983, "rewards/rejected": -7.746376811594203, "step": 790 }, { "epoch": 0.54233801851217, "grad_norm": 0.20831766609919114, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116814496.47761194, "logits/rejected": 119434525.37704918, "logps/chosen": -235.46268656716418, "logps/rejected": -433.3114754098361, "loss": 0.2178, "rewards/chosen": 0.9048507462686567, "rewards/margins": 8.158949106924394, "rewards/rejected": -7.254098360655738, "step": 791 }, { "epoch": 0.5430236544394926, "grad_norm": 0.17466308275069875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69831482.38596492, "logits/rejected": 128901004.61971831, "logps/chosen": -198.17543859649123, "logps/rejected": -426.36619718309856, "loss": 0.1338, "rewards/chosen": 1.9989035087719298, "rewards/margins": 9.65383308623672, "rewards/rejected": -7.654929577464789, "step": 792 }, { "epoch": 0.5437092903668153, "grad_norm": 0.1696892857583663, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80200176.48484848, "logits/rejected": 114396259.09677419, "logps/chosen": -185.21212121212122, "logps/rejected": -395.0967741935484, "loss": 0.1732, "rewards/chosen": 1.5416666666666667, "rewards/margins": 8.799731182795698, "rewards/rejected": -7.258064516129032, "step": 793 }, { "epoch": 0.5443949262941378, "grad_norm": 0.16502343710744294, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116391936.0, "logits/rejected": 85131264.0, "logps/chosen": -215.5, "logps/rejected": -351.0, "loss": 0.1651, "rewards/chosen": 1.685546875, "rewards/margins": 7.544921875, "rewards/rejected": -5.859375, "step": 794 }, { "epoch": 0.5450805622214604, "grad_norm": 0.20565384862629923, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126651532.5490196, "logits/rejected": 94290132.77922077, "logps/chosen": -256.94117647058823, "logps/rejected": -356.15584415584414, "loss": 0.1498, "rewards/chosen": 1.3419117647058822, "rewards/margins": 8.861392284186401, "rewards/rejected": -7.51948051948052, "step": 795 }, { "epoch": 0.545766198148783, "grad_norm": 0.3503771276120829, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110729625.6, "logits/rejected": 79159165.96825397, "logps/chosen": -271.75384615384615, "logps/rejected": -364.1904761904762, "loss": 0.1553, "rewards/chosen": 1.9192307692307693, "rewards/margins": 9.18907203907204, "rewards/rejected": -7.26984126984127, "step": 796 }, { "epoch": 0.5464518340761055, "grad_norm": 0.14747577947696064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85103781.16129032, "logits/rejected": 127227221.33333333, "logps/chosen": -216.25806451612902, "logps/rejected": -406.7878787878788, "loss": 0.1572, "rewards/chosen": 1.5161290322580645, "rewards/margins": 8.470674486803519, "rewards/rejected": -6.954545454545454, "step": 797 }, { "epoch": 0.5471374700034282, "grad_norm": 0.2157009603053552, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110642846.89655173, "logits/rejected": 115403278.62857144, "logps/chosen": -214.89655172413794, "logps/rejected": -373.9428571428571, "loss": 0.1828, "rewards/chosen": 1.3329741379310345, "rewards/margins": 8.561545566502463, "rewards/rejected": -7.228571428571429, "step": 798 }, { "epoch": 0.5478231059307508, "grad_norm": 0.1576447585402965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141738548.96551725, "logits/rejected": 104153556.11428571, "logps/chosen": -267.3103448275862, "logps/rejected": -371.2, "loss": 0.1686, "rewards/chosen": 1.675646551724138, "rewards/margins": 8.554217980295567, "rewards/rejected": -6.878571428571429, "step": 799 }, { "epoch": 0.5485087418580734, "grad_norm": 0.1420684102529033, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109013773.96363637, "logits/rejected": 112211996.05479452, "logps/chosen": -224.5818181818182, "logps/rejected": -359.45205479452056, "loss": 0.1574, "rewards/chosen": 1.4522727272727274, "rewards/margins": 8.774190535491906, "rewards/rejected": -7.321917808219178, "step": 800 }, { "epoch": 0.5491943777853959, "grad_norm": 0.17862778177843547, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118924650.33846153, "logits/rejected": 87880655.23809524, "logps/chosen": -217.35384615384615, "logps/rejected": -360.63492063492066, "loss": 0.1509, "rewards/chosen": 1.35, "rewards/margins": 8.532539682539683, "rewards/rejected": -7.182539682539683, "step": 801 }, { "epoch": 0.5498800137127186, "grad_norm": 0.21349336032293784, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98211241.35384615, "logits/rejected": 123432374.85714285, "logps/chosen": -184.12307692307692, "logps/rejected": -358.0952380952381, "loss": 0.1681, "rewards/chosen": 1.2625, "rewards/margins": 8.675198412698412, "rewards/rejected": -7.412698412698413, "step": 802 }, { "epoch": 0.5505656496400412, "grad_norm": 0.21081159693334753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162590960.94117647, "logits/rejected": 50716125.86666667, "logps/chosen": -229.64705882352942, "logps/rejected": -317.06666666666666, "loss": 0.1843, "rewards/chosen": 1.3189338235294117, "rewards/margins": -2446800.01439951, "rewards/rejected": 2446801.3333333335, "step": 803 }, { "epoch": 0.5512512855673637, "grad_norm": 0.22318865992044887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106479397.54666667, "logits/rejected": 85627111.8490566, "logps/chosen": -225.70666666666668, "logps/rejected": -366.49056603773585, "loss": 0.1882, "rewards/chosen": 1.5383333333333333, "rewards/margins": 5.3119182389937105, "rewards/rejected": -3.7735849056603774, "step": 804 }, { "epoch": 0.5519369214946863, "grad_norm": 0.35067135072549, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 177838489.6, "logits/rejected": 74807997.36986302, "logps/chosen": -315.6363636363636, "logps/rejected": -380.4931506849315, "loss": 0.172, "rewards/chosen": 0.25227272727272726, "rewards/margins": -7857389.336768369, "rewards/rejected": 7857389.589041096, "step": 805 }, { "epoch": 0.5526225574220089, "grad_norm": 0.1487801298977187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157386264.3809524, "logits/rejected": 67754141.53846154, "logps/chosen": -210.03174603174602, "logps/rejected": -353.7230769230769, "loss": 0.1822, "rewards/chosen": 1.2261904761904763, "rewards/margins": 9.333882783882784, "rewards/rejected": -8.107692307692307, "step": 806 }, { "epoch": 0.5533081933493315, "grad_norm": 0.1845354889665499, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160146152.72727272, "logits/rejected": 56893704.258064516, "logps/chosen": -276.3636363636364, "logps/rejected": -323.61290322580646, "loss": 0.1701, "rewards/chosen": 1.6628787878787878, "rewards/margins": 8.638685239491691, "rewards/rejected": -6.975806451612903, "step": 807 }, { "epoch": 0.5539938292766541, "grad_norm": 0.1893565647632578, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75815222.3030303, "logits/rejected": 101001546.32258065, "logps/chosen": -186.42424242424244, "logps/rejected": -339.35483870967744, "loss": 0.1875, "rewards/chosen": 1.1041666666666667, "rewards/margins": 8.096102150537634, "rewards/rejected": -6.991935483870968, "step": 808 }, { "epoch": 0.5546794652039767, "grad_norm": 0.17357797939490804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139237140.98360655, "logits/rejected": 90146235.2238806, "logps/chosen": -212.19672131147541, "logps/rejected": -356.7761194029851, "loss": 0.1732, "rewards/chosen": 1.3709016393442623, "rewards/margins": 9.744035967702471, "rewards/rejected": -8.373134328358208, "step": 809 }, { "epoch": 0.5553651011312993, "grad_norm": 0.28588490882473616, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110100480.0, "logits/rejected": 69954998.85714285, "logps/chosen": -206.44444444444446, "logps/rejected": -339.42857142857144, "loss": 0.197, "rewards/chosen": 1.3541666666666667, "rewards/margins": 8.30952380952381, "rewards/rejected": -6.955357142857143, "step": 810 }, { "epoch": 0.5560507370586218, "grad_norm": 0.19333867746942848, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137766754.46153846, "logits/rejected": 95696357.05263157, "logps/chosen": -247.3846153846154, "logps/rejected": -351.57894736842104, "loss": 0.164, "rewards/chosen": 1.3939302884615385, "rewards/margins": 8.722877656882591, "rewards/rejected": -7.328947368421052, "step": 811 }, { "epoch": 0.5567363729859445, "grad_norm": 0.2174488123160433, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114777461.84126984, "logits/rejected": 76400860.55384615, "logps/chosen": -259.1746031746032, "logps/rejected": -367.0153846153846, "loss": 0.1642, "rewards/chosen": 1.4970238095238095, "rewards/margins": 9.35856227106227, "rewards/rejected": -7.861538461538461, "step": 812 }, { "epoch": 0.5574220089132671, "grad_norm": 0.16570351163706448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59738872.68571428, "logits/rejected": 114909466.48275863, "logps/chosen": -190.17142857142858, "logps/rejected": -360.0, "loss": 0.1767, "rewards/chosen": 1.1625, "rewards/margins": 9.334913793103448, "rewards/rejected": -8.172413793103448, "step": 813 }, { "epoch": 0.5581076448405896, "grad_norm": 0.22762851231464395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157417472.0, "logits/rejected": 98238464.0, "logps/chosen": -249.25, "logps/rejected": -339.0, "loss": 0.1756, "rewards/chosen": 1.4853515625, "rewards/margins": 8.3837890625, "rewards/rejected": -6.8984375, "step": 814 }, { "epoch": 0.5587932807679122, "grad_norm": 0.19792012294660366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86778703.44827586, "logits/rejected": 108692392.22857143, "logps/chosen": -242.48275862068965, "logps/rejected": -367.54285714285714, "loss": 0.1589, "rewards/chosen": 1.603448275862069, "rewards/margins": 8.639162561576354, "rewards/rejected": -7.035714285714286, "step": 815 }, { "epoch": 0.5594789166952349, "grad_norm": 0.16062505511070593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137870831.48387095, "logits/rejected": 114135908.84848484, "logps/chosen": -219.09677419354838, "logps/rejected": -361.2121212121212, "loss": 0.1547, "rewards/chosen": 1.4495967741935485, "rewards/margins": 9.048081622678398, "rewards/rejected": -7.598484848484849, "step": 816 }, { "epoch": 0.5601645526225574, "grad_norm": 0.17461027927585615, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96017297.72307692, "logits/rejected": 111848106.66666667, "logps/chosen": -233.35384615384615, "logps/rejected": -382.984126984127, "loss": 0.1516, "rewards/chosen": 1.2442307692307693, "rewards/margins": 8.78391330891331, "rewards/rejected": -7.5396825396825395, "step": 817 }, { "epoch": 0.56085018854988, "grad_norm": 0.19429174957461656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122852517.16129032, "logits/rejected": 129133723.15151516, "logps/chosen": -236.6451612903226, "logps/rejected": -364.1212121212121, "loss": 0.1783, "rewards/chosen": 1.3634072580645162, "rewards/margins": 7.651286045943304, "rewards/rejected": -6.287878787878788, "step": 818 }, { "epoch": 0.5615358244772026, "grad_norm": 0.16347979237896298, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117720132.26666667, "logits/rejected": 120031111.52941176, "logps/chosen": -208.0, "logps/rejected": -374.5882352941176, "loss": 0.162, "rewards/chosen": 0.7666666666666667, "rewards/margins": 7.854901960784313, "rewards/rejected": -7.088235294117647, "step": 819 }, { "epoch": 0.5622214604045251, "grad_norm": 0.15315691661445668, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114542629.23636363, "logits/rejected": 103765931.83561644, "logps/chosen": -199.85454545454544, "logps/rejected": -403.7260273972603, "loss": 0.1334, "rewards/chosen": 1.6272727272727272, "rewards/margins": 9.538231631382317, "rewards/rejected": -7.910958904109589, "step": 820 }, { "epoch": 0.5629070963318478, "grad_norm": 0.221128094496122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129761280.0, "logits/rejected": 67272704.0, "logps/chosen": -218.5, "logps/rejected": -336.0, "loss": 0.1861, "rewards/chosen": 1.3291015625, "rewards/margins": 9.1337890625, "rewards/rejected": -7.8046875, "step": 821 }, { "epoch": 0.5635927322591704, "grad_norm": 0.15669703483086939, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93197434.88, "logits/rejected": 90930359.79487179, "logps/chosen": -253.44, "logps/rejected": -374.15384615384613, "loss": 0.1265, "rewards/chosen": 1.84625, "rewards/margins": 9.551378205128206, "rewards/rejected": -7.705128205128205, "step": 822 }, { "epoch": 0.564278368186493, "grad_norm": 0.1539248174729263, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137712981.33333334, "logits/rejected": 49098029.176470585, "logps/chosen": -206.93333333333334, "logps/rejected": -366.11764705882354, "loss": 0.1751, "rewards/chosen": 1.3552083333333333, "rewards/margins": 8.171384803921569, "rewards/rejected": -6.8161764705882355, "step": 823 }, { "epoch": 0.5649640041138155, "grad_norm": 0.21865544222318756, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136830573.1147541, "logits/rejected": 139056632.35820895, "logps/chosen": -267.0163934426229, "logps/rejected": -363.46268656716416, "loss": 0.1595, "rewards/chosen": 1.264344262295082, "rewards/margins": -26011434.675954245, "rewards/rejected": 26011435.94029851, "step": 824 }, { "epoch": 0.5656496400411382, "grad_norm": 0.15957268733897406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86640548.29850747, "logits/rejected": 111802267.27868852, "logps/chosen": -243.5820895522388, "logps/rejected": -385.3114754098361, "loss": 0.1589, "rewards/chosen": 1.7798507462686568, "rewards/margins": 8.107719598727673, "rewards/rejected": -6.327868852459017, "step": 825 }, { "epoch": 0.5663352759684608, "grad_norm": 0.19129299306558253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70184686.93333334, "logits/rejected": 104055747.76470588, "logps/chosen": -307.46666666666664, "logps/rejected": -334.11764705882354, "loss": 0.1738, "rewards/chosen": 1.3395833333333333, "rewards/margins": 8.979289215686274, "rewards/rejected": -7.639705882352941, "step": 826 }, { "epoch": 0.5670209118957833, "grad_norm": 0.20442717131779395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104918387.01449275, "logits/rejected": 105568498.98305085, "logps/chosen": -203.59420289855072, "logps/rejected": -426.8474576271187, "loss": 0.2137, "rewards/chosen": 0.9266304347826086, "rewards/margins": 8.892732129697864, "rewards/rejected": -7.966101694915254, "step": 827 }, { "epoch": 0.5677065478231059, "grad_norm": 0.1619644787771118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89260032.0, "logits/rejected": 134086656.0, "logps/chosen": -194.625, "logps/rejected": -383.0, "loss": 0.1771, "rewards/chosen": 1.1904296875, "rewards/margins": 7.2294921875, "rewards/rejected": -6.0390625, "step": 828 }, { "epoch": 0.5683921837504285, "grad_norm": 0.18276497383174806, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115734354.44067797, "logits/rejected": 94584594.55072464, "logps/chosen": -206.77966101694915, "logps/rejected": -369.15942028985506, "loss": 0.151, "rewards/chosen": 1.74364406779661, "rewards/margins": 9.475528125767624, "rewards/rejected": -7.731884057971015, "step": 829 }, { "epoch": 0.5690778196777511, "grad_norm": 0.18278532369246314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114688000.0, "logits/rejected": 104529920.0, "logps/chosen": -205.25, "logps/rejected": -326.0, "loss": 0.1725, "rewards/chosen": 1.03125, "rewards/margins": 8.546875, "rewards/rejected": -7.515625, "step": 830 }, { "epoch": 0.5697634556050737, "grad_norm": 0.188711999235838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107772285.83050847, "logits/rejected": 90390290.55072464, "logps/chosen": -218.03389830508473, "logps/rejected": -366.84057971014494, "loss": 0.1577, "rewards/chosen": 1.465042372881356, "rewards/margins": 9.10996990911324, "rewards/rejected": -7.644927536231884, "step": 831 }, { "epoch": 0.5704490915323963, "grad_norm": 0.23405455182268595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106498849.39130434, "logits/rejected": 99881307.11864407, "logps/chosen": -219.59420289855072, "logps/rejected": -391.0508474576271, "loss": 0.1956, "rewards/chosen": 1.5344202898550725, "rewards/margins": 11072807.500521984, "rewards/rejected": -11072805.966101695, "step": 832 }, { "epoch": 0.5711347274597189, "grad_norm": 0.17001876127576715, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148997656.3809524, "logits/rejected": 102534600.86153845, "logps/chosen": -257.77777777777777, "logps/rejected": -357.4153846153846, "loss": 0.1835, "rewards/chosen": 0.8591269841269841, "rewards/margins": 8.897588522588523, "rewards/rejected": -8.038461538461538, "step": 833 }, { "epoch": 0.5718203633870415, "grad_norm": 0.23227304530166912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127982951.78378378, "logits/rejected": 94682529.18518518, "logps/chosen": -255.78378378378378, "logps/rejected": -381.037037037037, "loss": 0.2257, "rewards/chosen": 1.3918918918918919, "rewards/margins": 6.604854854854855, "rewards/rejected": -5.212962962962963, "step": 834 }, { "epoch": 0.5725059993143641, "grad_norm": 0.20696780639874907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128382174.60869566, "logits/rejected": 81469023.45762712, "logps/chosen": -224.92753623188406, "logps/rejected": -336.8135593220339, "loss": 0.1687, "rewards/chosen": 1.6802536231884058, "rewards/margins": 7.587033284205354, "rewards/rejected": -5.906779661016949, "step": 835 }, { "epoch": 0.5731916352416867, "grad_norm": 0.18717744255836202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158964121.6, "logits/rejected": 69391058.8235294, "logps/chosen": -226.4, "logps/rejected": -348.2352941176471, "loss": 0.1503, "rewards/chosen": 2.05625, "rewards/margins": 9.82095588235294, "rewards/rejected": -7.764705882352941, "step": 836 }, { "epoch": 0.5738772711690092, "grad_norm": 0.14527432881371397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163660097.25490198, "logits/rejected": 106219387.01298702, "logps/chosen": -236.86274509803923, "logps/rejected": -374.4415584415584, "loss": 0.1444, "rewards/chosen": 1.3590686274509804, "rewards/margins": 9.320107588489941, "rewards/rejected": -7.961038961038961, "step": 837 }, { "epoch": 0.5745629070963318, "grad_norm": 0.18555881678142647, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173725365.67741936, "logits/rejected": 47456007.75757576, "logps/chosen": -241.29032258064515, "logps/rejected": -341.3333333333333, "loss": 0.2105, "rewards/chosen": 0.9425403225806451, "rewards/margins": 8.55617668621701, "rewards/rejected": -7.613636363636363, "step": 838 }, { "epoch": 0.5752485430236545, "grad_norm": 0.17872413528414086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118388539.61643836, "logits/rejected": 136467400.14545456, "logps/chosen": -224.21917808219177, "logps/rejected": -431.7090909090909, "loss": 0.1915, "rewards/chosen": 1.5256849315068493, "rewards/margins": 8.31659402241594, "rewards/rejected": -6.790909090909091, "step": 839 }, { "epoch": 0.575934178950977, "grad_norm": 0.1678217493200885, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167172973.7142857, "logits/rejected": 45811923.862068966, "logps/chosen": -255.77142857142857, "logps/rejected": -372.41379310344826, "loss": 0.1496, "rewards/chosen": 2.117857142857143, "rewards/margins": 8.24716748768473, "rewards/rejected": -6.129310344827586, "step": 840 }, { "epoch": 0.5766198148782996, "grad_norm": 0.17820947574767074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 218393070.3448276, "logits/rejected": 14874799.542857142, "logps/chosen": -256.82758620689657, "logps/rejected": -350.62857142857143, "loss": 0.1297, "rewards/chosen": 2.009698275862069, "rewards/margins": 8.166841133004926, "rewards/rejected": -6.1571428571428575, "step": 841 }, { "epoch": 0.5773054508056222, "grad_norm": 0.23004485264937063, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158515764.96551725, "logits/rejected": 90177536.0, "logps/chosen": -280.82758620689657, "logps/rejected": -344.0, "loss": 0.1657, "rewards/chosen": 1.3265086206896552, "rewards/margins": 9.469365763546797, "rewards/rejected": -8.142857142857142, "step": 842 }, { "epoch": 0.5779910867329449, "grad_norm": 0.19878357608238384, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104055747.76470588, "logits/rejected": 103109973.33333333, "logps/chosen": -258.8235294117647, "logps/rejected": -427.73333333333335, "loss": 0.1898, "rewards/chosen": 1.59375, "rewards/margins": 8.202083333333334, "rewards/rejected": -6.608333333333333, "step": 843 }, { "epoch": 0.5786767226602674, "grad_norm": 0.19368219316116306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79623016.91803278, "logits/rejected": 131150251.94029851, "logps/chosen": -226.62295081967213, "logps/rejected": -344.8358208955224, "loss": 0.1732, "rewards/chosen": 1.4405737704918034, "rewards/margins": 8.306245412282848, "rewards/rejected": -6.865671641791045, "step": 844 }, { "epoch": 0.57936235858759, "grad_norm": 0.24473687955984585, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104279994.57627119, "logits/rejected": 86803856.69565217, "logps/chosen": -169.89830508474577, "logps/rejected": -392.3478260869565, "loss": 0.1674, "rewards/chosen": 1.1504237288135593, "rewards/margins": 8.933032424465733, "rewards/rejected": -7.782608695652174, "step": 845 }, { "epoch": 0.5800479945149126, "grad_norm": 0.2741299940650214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133239057.06666666, "logits/rejected": 114603188.70588236, "logps/chosen": -280.26666666666665, "logps/rejected": -379.29411764705884, "loss": 0.1956, "rewards/chosen": 1.38828125, "rewards/margins": 8.572104779411765, "rewards/rejected": -7.1838235294117645, "step": 846 }, { "epoch": 0.5807336304422351, "grad_norm": 0.2551262944059797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 58246705.548387095, "logits/rejected": 105111800.24242425, "logps/chosen": -168.1290322580645, "logps/rejected": -364.1212121212121, "loss": 0.1775, "rewards/chosen": 1.404233870967742, "rewards/margins": 5838816.919385387, "rewards/rejected": -5838815.515151516, "step": 847 }, { "epoch": 0.5814192663695578, "grad_norm": 0.18606723818851367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137363456.0, "logits/rejected": 108871680.0, "logps/chosen": -223.0, "logps/rejected": -398.0, "loss": 0.153, "rewards/chosen": 1.8046875, "rewards/margins": 10.046875, "rewards/rejected": -8.2421875, "step": 848 }, { "epoch": 0.5821049022968804, "grad_norm": 0.1798415905312652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149523258.3859649, "logits/rejected": 91624866.25352113, "logps/chosen": -252.91228070175438, "logps/rejected": -387.6056338028169, "loss": 0.178, "rewards/chosen": 1.0032894736842106, "rewards/margins": 9.270895107487027, "rewards/rejected": -8.267605633802816, "step": 849 }, { "epoch": 0.5827905382242029, "grad_norm": 0.1890866171810473, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151388160.0, "logits/rejected": 72613888.0, "logps/chosen": -221.0, "logps/rejected": -371.5, "loss": 0.1824, "rewards/chosen": 1.455078125, "rewards/margins": 9.798828125, "rewards/rejected": -8.34375, "step": 850 }, { "epoch": 0.5834761741515255, "grad_norm": 0.14207172995560155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134726128.4848485, "logits/rejected": 128332172.38709678, "logps/chosen": -259.1515151515151, "logps/rejected": -452.64516129032256, "loss": 0.1437, "rewards/chosen": 1.759469696969697, "rewards/margins": 10.87237292277615, "rewards/rejected": -9.112903225806452, "step": 851 }, { "epoch": 0.5841618100788482, "grad_norm": 0.22992101309263857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127367031.46666667, "logits/rejected": 67016342.5882353, "logps/chosen": -211.2, "logps/rejected": -323.7647058823529, "loss": 0.1778, "rewards/chosen": 0.9604166666666667, "rewards/margins": 7.99718137254902, "rewards/rejected": -7.036764705882353, "step": 852 }, { "epoch": 0.5848474460061707, "grad_norm": 0.14327413333469816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158970997.5081967, "logits/rejected": 43429826.86567164, "logps/chosen": -250.75409836065575, "logps/rejected": -355.82089552238807, "loss": 0.1648, "rewards/chosen": 1.625, "rewards/margins": 9.550373134328357, "rewards/rejected": -7.925373134328358, "step": 853 }, { "epoch": 0.5855330819334933, "grad_norm": 0.2153938137891542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171679182.9041096, "logits/rejected": 81369497.6, "logps/chosen": -350.6849315068493, "logps/rejected": -402.03636363636366, "loss": 0.2023, "rewards/chosen": 1.7893835616438356, "rewards/margins": 7.5348381070983805, "rewards/rejected": -5.745454545454545, "step": 854 }, { "epoch": 0.5862187178608159, "grad_norm": 0.17851354470802147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142283697.23076922, "logits/rejected": 79075945.65079366, "logps/chosen": -266.83076923076925, "logps/rejected": -368.76190476190476, "loss": 0.1635, "rewards/chosen": 1.6423076923076922, "rewards/margins": 9.205799755799756, "rewards/rejected": -7.563492063492063, "step": 855 }, { "epoch": 0.5869043537881385, "grad_norm": 0.2077823069157212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112180987.93650794, "logits/rejected": 123248009.84615384, "logps/chosen": -255.4920634920635, "logps/rejected": -366.7692307692308, "loss": 0.153, "rewards/chosen": 1.9404761904761905, "rewards/margins": 9.986630036630036, "rewards/rejected": -8.046153846153846, "step": 856 }, { "epoch": 0.5875899897154611, "grad_norm": 0.18763204483936327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134905318.81967214, "logits/rejected": 67629239.40298508, "logps/chosen": -217.18032786885246, "logps/rejected": -343.4029850746269, "loss": 0.1627, "rewards/chosen": 1.5512295081967213, "rewards/margins": 7.476602642525079, "rewards/rejected": -5.925373134328358, "step": 857 }, { "epoch": 0.5882756256427837, "grad_norm": 0.13312906042968028, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128066082.13333334, "logits/rejected": 72413424.94117647, "logps/chosen": -206.66666666666666, "logps/rejected": -370.3529411764706, "loss": 0.1631, "rewards/chosen": 1.4770833333333333, "rewards/margins": 9.344730392156862, "rewards/rejected": -7.867647058823529, "step": 858 }, { "epoch": 0.5889612615701063, "grad_norm": 0.18297523394221052, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171154663.22580644, "logits/rejected": 62088409.21212121, "logps/chosen": -254.70967741935485, "logps/rejected": -381.09090909090907, "loss": 0.1631, "rewards/chosen": 1.532258064516129, "rewards/margins": 8.017106549364613, "rewards/rejected": -6.484848484848484, "step": 859 }, { "epoch": 0.5896468974974288, "grad_norm": 0.1594514270824503, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120570589.6119403, "logits/rejected": 126172915.40983607, "logps/chosen": -244.53731343283582, "logps/rejected": -408.1311475409836, "loss": 0.1641, "rewards/chosen": 1.7397388059701493, "rewards/margins": 11.30531257646195, "rewards/rejected": -9.565573770491802, "step": 860 }, { "epoch": 0.5903325334247514, "grad_norm": 0.18571665893789013, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164857474.16949153, "logits/rejected": 94238868.4057971, "logps/chosen": -216.8135593220339, "logps/rejected": -361.27536231884056, "loss": 0.1682, "rewards/chosen": 0.7161016949152542, "rewards/margins": 9.114652419552936, "rewards/rejected": -8.398550724637682, "step": 861 }, { "epoch": 0.5910181693520741, "grad_norm": 0.16835060320704456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63361494.03278688, "logits/rejected": 164767285.49253732, "logps/chosen": -200.13114754098362, "logps/rejected": -452.2985074626866, "loss": 0.1794, "rewards/chosen": 0.8565573770491803, "rewards/margins": 9.632676780034254, "rewards/rejected": -8.776119402985074, "step": 862 }, { "epoch": 0.5917038052793966, "grad_norm": 0.16014655213886322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130378873.49152543, "logits/rejected": 92092326.95652173, "logps/chosen": -213.6949152542373, "logps/rejected": -403.4782608695652, "loss": 0.1623, "rewards/chosen": 1.2828389830508475, "rewards/margins": 8.90602738884795, "rewards/rejected": -7.6231884057971016, "step": 863 }, { "epoch": 0.5923894412067192, "grad_norm": 0.1690102819803243, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149684224.0, "logits/rejected": 58753024.0, "logps/chosen": -227.875, "logps/rejected": -354.0, "loss": 0.1771, "rewards/chosen": 1.08984375, "rewards/margins": 7.88671875, "rewards/rejected": -6.796875, "step": 864 }, { "epoch": 0.5930750771340418, "grad_norm": 0.23292730893016575, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149619762.36065573, "logits/rejected": 76499096.8358209, "logps/chosen": -230.29508196721312, "logps/rejected": -314.2686567164179, "loss": 0.1702, "rewards/chosen": 1.6229508196721312, "rewards/margins": 7.540861267433326, "rewards/rejected": -5.917910447761194, "step": 865 }, { "epoch": 0.5937607130613645, "grad_norm": 0.21813768573523487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109341166.34482759, "logits/rejected": 87780790.85714285, "logps/chosen": -259.86206896551727, "logps/rejected": -385.37142857142857, "loss": 0.1414, "rewards/chosen": 1.5581896551724137, "rewards/margins": 9.022475369458128, "rewards/rejected": -7.464285714285714, "step": 866 }, { "epoch": 0.594446348988687, "grad_norm": 0.23941210025326645, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146407424.0, "logits/rejected": 60751872.0, "logps/chosen": -245.375, "logps/rejected": -344.75, "loss": 0.1846, "rewards/chosen": 1.654296875, "rewards/margins": 8.873046875, "rewards/rejected": -7.21875, "step": 867 }, { "epoch": 0.5951319849160096, "grad_norm": 0.15093604692277254, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115457750.10909091, "logits/rejected": 74233435.1780822, "logps/chosen": -217.6, "logps/rejected": -398.4657534246575, "loss": 0.1623, "rewards/chosen": 1.0727272727272728, "rewards/margins": 8.812453300124533, "rewards/rejected": -7.739726027397261, "step": 868 }, { "epoch": 0.5958176208433322, "grad_norm": 0.1538397961204854, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92822807.27272727, "logits/rejected": 109187204.12903225, "logps/chosen": -212.6060606060606, "logps/rejected": -379.35483870967744, "loss": 0.1768, "rewards/chosen": 1.3920454545454546, "rewards/margins": 10.142045454545455, "rewards/rejected": -8.75, "step": 869 }, { "epoch": 0.5965032567706547, "grad_norm": 0.20977497425081945, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124431018.66666667, "logits/rejected": 78338774.70967741, "logps/chosen": -218.1818181818182, "logps/rejected": -401.5483870967742, "loss": 0.196, "rewards/chosen": 0.45928030303030304, "rewards/margins": 8.459280303030303, "rewards/rejected": -8.0, "step": 870 }, { "epoch": 0.5971888926979774, "grad_norm": 0.1688727076424653, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148331893.84126985, "logits/rejected": 74561819.56923077, "logps/chosen": -259.8095238095238, "logps/rejected": -349.53846153846155, "loss": 0.1443, "rewards/chosen": 2.125, "rewards/margins": 10.432692307692308, "rewards/rejected": -8.307692307692308, "step": 871 }, { "epoch": 0.5978745286253, "grad_norm": 0.20762652794678624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118797492.70588236, "logits/rejected": 92064972.8, "logps/chosen": -186.35294117647058, "logps/rejected": -344.0, "loss": 0.1848, "rewards/chosen": 1.1355698529411764, "rewards/margins": 8.26890318627451, "rewards/rejected": -7.133333333333334, "step": 872 }, { "epoch": 0.5985601645526225, "grad_norm": 0.16963227368096978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123370390.06896552, "logits/rejected": 78014054.4, "logps/chosen": -247.44827586206895, "logps/rejected": -365.25714285714287, "loss": 0.1493, "rewards/chosen": 1.6567887931034482, "rewards/margins": 9.64964593596059, "rewards/rejected": -7.992857142857143, "step": 873 }, { "epoch": 0.5992458004799451, "grad_norm": 0.1780953000929894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92145632.4923077, "logits/rejected": 167905312.5079365, "logps/chosen": -259.81538461538463, "logps/rejected": -481.015873015873, "loss": 0.15, "rewards/chosen": 1.6307692307692307, "rewards/margins": 11.05934065934066, "rewards/rejected": -9.428571428571429, "step": 874 }, { "epoch": 0.5999314364072678, "grad_norm": 0.15987792403700374, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 211740036.41379312, "logits/rejected": 34423252.114285715, "logps/chosen": -275.0344827586207, "logps/rejected": -314.9714285714286, "loss": 0.1358, "rewards/chosen": 1.7273706896551724, "rewards/margins": 10.255942118226601, "rewards/rejected": -8.528571428571428, "step": 875 }, { "epoch": 0.6006170723345904, "grad_norm": 0.21214731784587063, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170182319.76119402, "logits/rejected": 107917379.14754099, "logps/chosen": -313.07462686567163, "logps/rejected": -449.57377049180326, "loss": 0.1661, "rewards/chosen": 1.6977611940298507, "rewards/margins": 9.607597259603622, "rewards/rejected": -7.909836065573771, "step": 876 }, { "epoch": 0.6013027082619129, "grad_norm": 0.19011890859992878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107094562.13333334, "logits/rejected": 94495201.88235295, "logps/chosen": -175.2, "logps/rejected": -338.8235294117647, "loss": 0.1618, "rewards/chosen": 1.403125, "rewards/margins": 9.138419117647059, "rewards/rejected": -7.735294117647059, "step": 877 }, { "epoch": 0.6019883441892355, "grad_norm": 0.1704999864922845, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153349640.98245615, "logits/rejected": 88198533.40845071, "logps/chosen": -221.47368421052633, "logps/rejected": -362.36619718309856, "loss": 0.1689, "rewards/chosen": 1.037280701754386, "rewards/margins": 9.783759574993821, "rewards/rejected": -8.746478873239436, "step": 878 }, { "epoch": 0.6026739801165582, "grad_norm": 0.2467838811326254, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87674483.61290322, "logits/rejected": 115788210.42424242, "logps/chosen": -186.19354838709677, "logps/rejected": -400.969696969697, "loss": 0.1461, "rewards/chosen": 1.346774193548387, "rewards/margins": 7.217986314760508, "rewards/rejected": -5.871212121212121, "step": 879 }, { "epoch": 0.6033596160438807, "grad_norm": 0.19891476291672858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 181193932.8, "logits/rejected": 29027246.73015873, "logps/chosen": -232.6153846153846, "logps/rejected": -321.5238095238095, "loss": 0.1944, "rewards/chosen": 1.3788461538461538, "rewards/margins": 4.942338217338218, "rewards/rejected": -3.5634920634920637, "step": 880 }, { "epoch": 0.6040452519712033, "grad_norm": 0.18776422046276858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165212923.66101694, "logits/rejected": 86986217.73913044, "logps/chosen": -249.22033898305085, "logps/rejected": -395.1304347826087, "loss": 0.1613, "rewards/chosen": 1.583686440677966, "rewards/margins": 9.26484586096782, "rewards/rejected": -7.681159420289855, "step": 881 }, { "epoch": 0.6047308878985259, "grad_norm": 0.19841633366174563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154539426.2535211, "logits/rejected": 74761629.19298245, "logps/chosen": -288.0, "logps/rejected": -320.280701754386, "loss": 0.1984, "rewards/chosen": 1.2403169014084507, "rewards/margins": 5.9771590066716085, "rewards/rejected": -4.7368421052631575, "step": 882 }, { "epoch": 0.6054165238258484, "grad_norm": 0.26245709977674053, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134217728.0, "logits/rejected": 92818394.07407407, "logps/chosen": -262.9189189189189, "logps/rejected": -395.55555555555554, "loss": 0.1789, "rewards/chosen": 1.7652027027027026, "rewards/margins": 7.848536036036036, "rewards/rejected": -6.083333333333333, "step": 883 }, { "epoch": 0.606102159753171, "grad_norm": 0.14877547353180623, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142027811.31034482, "logits/rejected": 115523115.88571429, "logps/chosen": -276.9655172413793, "logps/rejected": -359.77142857142854, "loss": 0.1359, "rewards/chosen": 1.8836206896551724, "rewards/margins": 9.84076354679803, "rewards/rejected": -7.957142857142857, "step": 884 }, { "epoch": 0.6067877956804937, "grad_norm": 0.19693205692785745, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168015308.057971, "logits/rejected": 81255753.76271187, "logps/chosen": -259.4782608695652, "logps/rejected": -396.47457627118644, "loss": 0.1855, "rewards/chosen": 1.3654891304347827, "rewards/margins": 10.543455232129698, "rewards/rejected": -9.177966101694915, "step": 885 }, { "epoch": 0.6074734316078162, "grad_norm": 0.16903283768374974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108277570.95384616, "logits/rejected": 111848106.66666667, "logps/chosen": -226.46153846153845, "logps/rejected": -362.1587301587302, "loss": 0.1714, "rewards/chosen": 1.603846153846154, "rewards/margins": 10.040354090354091, "rewards/rejected": -8.436507936507937, "step": 886 }, { "epoch": 0.6081590675351388, "grad_norm": 0.22288030975999176, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119240306.62686567, "logits/rejected": 75875646.95081967, "logps/chosen": -258.86567164179104, "logps/rejected": -376.655737704918, "loss": 0.2098, "rewards/chosen": 1.0415111940298507, "rewards/margins": 7.000527587472473, "rewards/rejected": -5.959016393442623, "step": 887 }, { "epoch": 0.6088447034624614, "grad_norm": 0.2120403754188035, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128879522.9090909, "logits/rejected": 103217085.93548387, "logps/chosen": -229.8181818181818, "logps/rejected": -386.5806451612903, "loss": 0.2003, "rewards/chosen": 0.9332386363636364, "rewards/margins": 9.239690249266863, "rewards/rejected": -8.306451612903226, "step": 888 }, { "epoch": 0.6095303393897841, "grad_norm": 0.20974032528023265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141041297.19402984, "logits/rejected": 78660389.77049181, "logps/chosen": -262.44776119402985, "logps/rejected": -378.75409836065575, "loss": 0.1715, "rewards/chosen": 1.3526119402985075, "rewards/margins": 7.418185710790311, "rewards/rejected": -6.065573770491803, "step": 889 }, { "epoch": 0.6102159753171066, "grad_norm": 0.18540537914499017, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150208512.0, "logits/rejected": 98304000.0, "logps/chosen": -248.5, "logps/rejected": -371.0, "loss": 0.1709, "rewards/chosen": 1.724609375, "rewards/margins": 9.279296875, "rewards/rejected": -7.5546875, "step": 890 }, { "epoch": 0.6109016112444292, "grad_norm": 0.21432052538543542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110259355.15151516, "logits/rejected": 91674293.67741935, "logps/chosen": -192.0, "logps/rejected": -398.96774193548384, "loss": 0.1841, "rewards/chosen": 1.371212121212121, "rewards/margins": 8.967986314760509, "rewards/rejected": -7.596774193548387, "step": 891 }, { "epoch": 0.6115872471717518, "grad_norm": 0.22550963173057667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108797703.75757575, "logits/rejected": 112637357.41935484, "logps/chosen": -231.75757575757575, "logps/rejected": -347.35483870967744, "loss": 0.1932, "rewards/chosen": 0.8948863636363636, "rewards/margins": 8.48359604105572, "rewards/rejected": -7.588709677419355, "step": 892 }, { "epoch": 0.6122728830990743, "grad_norm": 0.16806700762804172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115126413.2413793, "logits/rejected": 94252002.74285714, "logps/chosen": -294.8965517241379, "logps/rejected": -420.57142857142856, "loss": 0.1844, "rewards/chosen": 1.7435344827586208, "rewards/margins": 9.386391625615763, "rewards/rejected": -7.642857142857143, "step": 893 }, { "epoch": 0.612958519026397, "grad_norm": 0.18400577182565372, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62141020.32786885, "logits/rejected": 138599836.6567164, "logps/chosen": -208.52459016393442, "logps/rejected": -444.65671641791045, "loss": 0.1659, "rewards/chosen": 1.5102459016393444, "rewards/margins": 8.510245901639344, "rewards/rejected": -7.0, "step": 894 }, { "epoch": 0.6136441549537196, "grad_norm": 0.1546696543204609, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117898072.43636364, "logits/rejected": 129851055.34246576, "logps/chosen": -221.38181818181818, "logps/rejected": -381.36986301369865, "loss": 0.1797, "rewards/chosen": 0.6954545454545454, "rewards/margins": 7.037920298879203, "rewards/rejected": -6.342465753424658, "step": 895 }, { "epoch": 0.6143297908810421, "grad_norm": 0.17342412806753207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141544486.88607594, "logits/rejected": 103830423.51020408, "logps/chosen": -265.5189873417722, "logps/rejected": -316.40816326530614, "loss": 0.2001, "rewards/chosen": 1.6052215189873418, "rewards/margins": 8.737874580211832, "rewards/rejected": -7.13265306122449, "step": 896 }, { "epoch": 0.6150154268083647, "grad_norm": 0.23082290813777837, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127447974.1754386, "logits/rejected": 106688915.83098592, "logps/chosen": -263.859649122807, "logps/rejected": -420.9577464788732, "loss": 0.1356, "rewards/chosen": 1.6140350877192982, "rewards/margins": 9.874598468000988, "rewards/rejected": -8.26056338028169, "step": 897 }, { "epoch": 0.6157010627356874, "grad_norm": 0.1850197569119644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134587813.6470588, "logits/rejected": 85843421.86666666, "logps/chosen": -218.11764705882354, "logps/rejected": -392.26666666666665, "loss": 0.1815, "rewards/chosen": 1.6397058823529411, "rewards/margins": 7.723039215686274, "rewards/rejected": -6.083333333333333, "step": 898 }, { "epoch": 0.61638669866301, "grad_norm": 0.30849467139381354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92862639.54285714, "logits/rejected": 98349197.2413793, "logps/chosen": -226.74285714285713, "logps/rejected": -395.0344827586207, "loss": 0.1933, "rewards/chosen": 1.1517857142857142, "rewards/margins": 10.255233990147783, "rewards/rejected": -9.10344827586207, "step": 899 }, { "epoch": 0.6170723345903325, "grad_norm": 0.1845164478255075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101095062.58823529, "logits/rejected": 124116445.86666666, "logps/chosen": -226.35294117647058, "logps/rejected": -386.1333333333333, "loss": 0.1882, "rewards/chosen": 1.3327205882352942, "rewards/margins": 4.949387254901961, "rewards/rejected": -3.6166666666666667, "step": 900 }, { "epoch": 0.6177579705176551, "grad_norm": 0.1640179532166466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99731228.44444445, "logits/rejected": 91242243.93846154, "logps/chosen": -199.11111111111111, "logps/rejected": -335.75384615384615, "loss": 0.136, "rewards/chosen": 1.7658730158730158, "rewards/margins": 9.627411477411478, "rewards/rejected": -7.861538461538461, "step": 901 }, { "epoch": 0.6184436064449778, "grad_norm": 0.22518264469014335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135124604.54054055, "logits/rejected": 118760941.03703703, "logps/chosen": -245.6216216216216, "logps/rejected": -372.44444444444446, "loss": 0.1874, "rewards/chosen": 1.6047297297297298, "rewards/margins": 8.280655655655655, "rewards/rejected": -6.675925925925926, "step": 902 }, { "epoch": 0.6191292423723003, "grad_norm": 0.21933606469897968, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165212923.66101694, "logits/rejected": 32946561.855072465, "logps/chosen": -295.3220338983051, "logps/rejected": -322.3188405797101, "loss": 0.1826, "rewards/chosen": 1.5693855932203389, "rewards/margins": 8.489675448292802, "rewards/rejected": -6.920289855072464, "step": 903 }, { "epoch": 0.6198148782996229, "grad_norm": 0.1768615380182137, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85204289.82857142, "logits/rejected": 138339716.41379312, "logps/chosen": -205.4857142857143, "logps/rejected": -379.0344827586207, "loss": 0.1725, "rewards/chosen": 1.3133928571428573, "rewards/margins": 9.07201354679803, "rewards/rejected": -7.758620689655173, "step": 904 }, { "epoch": 0.6205005142269455, "grad_norm": 0.19560733699676663, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90374144.0, "logits/rejected": 118882304.0, "logps/chosen": -221.125, "logps/rejected": -380.0, "loss": 0.1793, "rewards/chosen": 1.65234375, "rewards/margins": 3.80859375, "rewards/rejected": -2.15625, "step": 905 }, { "epoch": 0.621186150154268, "grad_norm": 0.14432721248971195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144674759.89041096, "logits/rejected": 117364251.92727272, "logps/chosen": -282.73972602739724, "logps/rejected": -452.6545454545454, "loss": 0.1784, "rewards/chosen": 1.8304794520547945, "rewards/margins": 8.321388542963886, "rewards/rejected": -6.490909090909091, "step": 906 }, { "epoch": 0.6218717860815907, "grad_norm": 0.22007388291338026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116210182.82666667, "logits/rejected": 83332115.32075472, "logps/chosen": -250.88, "logps/rejected": -316.07547169811323, "loss": 0.2101, "rewards/chosen": 1.5483333333333333, "rewards/margins": 8.538899371069183, "rewards/rejected": -6.990566037735849, "step": 907 }, { "epoch": 0.6225574220089133, "grad_norm": 0.18934422207939608, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125500154.98039216, "logits/rejected": 81952342.44155844, "logps/chosen": -260.3921568627451, "logps/rejected": -339.1168831168831, "loss": 0.143, "rewards/chosen": 1.8651960784313726, "rewards/margins": 9.397663610898906, "rewards/rejected": -7.532467532467533, "step": 908 }, { "epoch": 0.6232430579362359, "grad_norm": 0.2037599637274335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138229670.95652175, "logits/rejected": 114774640.81355932, "logps/chosen": -276.17391304347825, "logps/rejected": -435.52542372881356, "loss": 0.1764, "rewards/chosen": 1.3804347826086956, "rewards/margins": 7.126197494473102, "rewards/rejected": -5.745762711864407, "step": 909 }, { "epoch": 0.6239286938635584, "grad_norm": 0.242362760263282, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167461470.8148148, "logits/rejected": 114946601.51351352, "logps/chosen": -256.8888888888889, "logps/rejected": -407.7837837837838, "loss": 0.1429, "rewards/chosen": 1.7986111111111112, "rewards/margins": 9.731043543543544, "rewards/rejected": -7.9324324324324325, "step": 910 }, { "epoch": 0.624614329790881, "grad_norm": 0.19149787698411372, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126538016.45070423, "logits/rejected": 56715084.350877196, "logps/chosen": -218.14084507042253, "logps/rejected": -377.2631578947368, "loss": 0.1814, "rewards/chosen": 1.5774647887323943, "rewards/margins": 10.86693847294292, "rewards/rejected": -9.289473684210526, "step": 911 }, { "epoch": 0.6252999657182037, "grad_norm": 0.23672150967631048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132775936.0, "logits/rejected": 92536832.0, "logps/chosen": -288.0, "logps/rejected": -325.25, "loss": 0.1788, "rewards/chosen": 1.8251953125, "rewards/margins": 9.4501953125, "rewards/rejected": -7.625, "step": 912 }, { "epoch": 0.6259856016455262, "grad_norm": 0.19870820292587085, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138715967.07246378, "logits/rejected": 94958331.66101696, "logps/chosen": -254.6086956521739, "logps/rejected": -340.06779661016947, "loss": 0.1828, "rewards/chosen": 1.6358695652173914, "rewards/margins": 8.661293294030951, "rewards/rejected": -7.02542372881356, "step": 913 }, { "epoch": 0.6266712375728488, "grad_norm": 0.17658977982269047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128284322.34146342, "logits/rejected": 107866557.2173913, "logps/chosen": -205.85365853658536, "logps/rejected": -418.7826086956522, "loss": 0.1907, "rewards/chosen": 2.033536585365854, "rewards/margins": -1641240.9229851537, "rewards/rejected": 1641242.956521739, "step": 914 }, { "epoch": 0.6273568735001714, "grad_norm": 0.2036933130789464, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163438045.86666667, "logits/rejected": 59236833.88235294, "logps/chosen": -289.8666666666667, "logps/rejected": -380.2352941176471, "loss": 0.1657, "rewards/chosen": 1.7104166666666667, "rewards/margins": 10.47512254901961, "rewards/rejected": -8.764705882352942, "step": 915 }, { "epoch": 0.628042509427494, "grad_norm": 0.1847608103849063, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150732800.0, "logits/rejected": 132513792.0, "logps/chosen": -264.0, "logps/rejected": -320.0, "loss": 0.1863, "rewards/chosen": 1.68359375, "rewards/margins": 9.05078125, "rewards/rejected": -7.3671875, "step": 916 }, { "epoch": 0.6287281453548166, "grad_norm": 0.2969415981362602, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124978101.79710145, "logits/rejected": 76954814.91525424, "logps/chosen": -302.1449275362319, "logps/rejected": -360.135593220339, "loss": 0.1913, "rewards/chosen": 1.9057121829710144, "rewards/margins": 10.575203708394742, "rewards/rejected": -8.669491525423728, "step": 917 }, { "epoch": 0.6294137812821392, "grad_norm": 0.18330476157857586, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75226871.74193548, "logits/rejected": 175398167.27272728, "logps/chosen": -249.03225806451613, "logps/rejected": -419.3939393939394, "loss": 0.1562, "rewards/chosen": 1.6189516129032258, "rewards/margins": 10.906830400782013, "rewards/rejected": -9.287878787878787, "step": 918 }, { "epoch": 0.6300994172094617, "grad_norm": 0.15676552827863352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 122701164.47457626, "logps/chosen": -246.02898550724638, "logps/rejected": -398.3728813559322, "loss": 0.1713, "rewards/chosen": 1.8713768115942029, "rewards/margins": 9.786631048882338, "rewards/rejected": -7.915254237288136, "step": 919 }, { "epoch": 0.6307850531367843, "grad_norm": 0.1904414321911457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111524665.31343284, "logits/rejected": 122253647.73770492, "logps/chosen": -300.17910447761193, "logps/rejected": -395.0163934426229, "loss": 0.1691, "rewards/chosen": 2.103544776119403, "rewards/margins": 9.906823464643992, "rewards/rejected": -7.80327868852459, "step": 920 }, { "epoch": 0.631470689064107, "grad_norm": 0.19902501007104972, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112918528.0, "logits/rejected": 84901888.0, "logps/chosen": -180.125, "logps/rejected": -385.0, "loss": 0.1791, "rewards/chosen": 0.84375, "rewards/margins": 7.640625, "rewards/rejected": -6.796875, "step": 921 }, { "epoch": 0.6321563249914296, "grad_norm": 0.168144145306783, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122333866.66666667, "logits/rejected": 100663296.0, "logps/chosen": -238.66666666666666, "logps/rejected": -359.05882352941177, "loss": 0.155, "rewards/chosen": 1.628125, "rewards/margins": 10.186948529411765, "rewards/rejected": -8.558823529411764, "step": 922 }, { "epoch": 0.6328419609187521, "grad_norm": 0.17123601681407113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128843776.0, "logits/rejected": 91750400.0, "logps/chosen": -276.5, "logps/rejected": -401.5, "loss": 0.1429, "rewards/chosen": 1.224609375, "rewards/margins": 9.630859375, "rewards/rejected": -8.40625, "step": 923 }, { "epoch": 0.6335275968460747, "grad_norm": 0.1847764444934648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158867329.96923077, "logits/rejected": 16219639.873015873, "logps/chosen": -219.3230769230769, "logps/rejected": -304.0, "loss": 0.1473, "rewards/chosen": 1.8346153846153845, "rewards/margins": 10.525091575091574, "rewards/rejected": -8.69047619047619, "step": 924 }, { "epoch": 0.6342132327733974, "grad_norm": 0.1761230202307175, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138632784.84210527, "logits/rejected": 82468287.09859155, "logps/chosen": -192.56140350877192, "logps/rejected": -344.3380281690141, "loss": 0.1639, "rewards/chosen": 1.2138157894736843, "rewards/margins": 8.594097479614529, "rewards/rejected": -7.380281690140845, "step": 925 }, { "epoch": 0.6348988687007199, "grad_norm": 0.23762614349855057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152930776.6153846, "logits/rejected": 80756996.06349206, "logps/chosen": -278.15384615384613, "logps/rejected": -371.3015873015873, "loss": 0.1618, "rewards/chosen": 1.5788461538461538, "rewards/margins": 7.586782661782662, "rewards/rejected": -6.007936507936508, "step": 926 }, { "epoch": 0.6355845046280425, "grad_norm": 0.17568800252211278, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101403467.29411764, "logits/rejected": 74857432.10389611, "logps/chosen": -229.64705882352942, "logps/rejected": -301.2987012987013, "loss": 0.1457, "rewards/chosen": 1.8259803921568627, "rewards/margins": 8.35195441813089, "rewards/rejected": -6.525974025974026, "step": 927 }, { "epoch": 0.6362701405553651, "grad_norm": 0.16488756520373915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73019019.63636364, "logits/rejected": 137803181.41935483, "logps/chosen": -178.66666666666666, "logps/rejected": -412.38709677419354, "loss": 0.1865, "rewards/chosen": 0.9289772727272727, "rewards/margins": 8.7354288856305, "rewards/rejected": -7.806451612903226, "step": 928 }, { "epoch": 0.6369557764826876, "grad_norm": 0.17999859326963386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134733421.1147541, "logits/rejected": 75497472.0, "logps/chosen": -243.14754098360655, "logps/rejected": -415.5223880597015, "loss": 0.1558, "rewards/chosen": 1.6127049180327868, "rewards/margins": 9.373898947883534, "rewards/rejected": -7.7611940298507465, "step": 929 }, { "epoch": 0.6376414124100103, "grad_norm": 0.1929765851084228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130351104.0, "logits/rejected": 167772160.0, "logps/chosen": -246.5, "logps/rejected": -443.5, "loss": 0.1763, "rewards/chosen": 1.677734375, "rewards/margins": 9.755859375, "rewards/rejected": -8.078125, "step": 930 }, { "epoch": 0.6383270483373329, "grad_norm": 0.17909829296307306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144670199.87301588, "logits/rejected": 82498733.29230769, "logps/chosen": -258.53968253968253, "logps/rejected": -389.4153846153846, "loss": 0.1906, "rewards/chosen": 0.6517857142857143, "rewards/margins": 8.782554945054944, "rewards/rejected": -8.13076923076923, "step": 931 }, { "epoch": 0.6390126842646555, "grad_norm": 0.18990041633455215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128943072.96969697, "logits/rejected": 72520869.16129032, "logps/chosen": -237.33333333333334, "logps/rejected": -381.6774193548387, "loss": 0.1903, "rewards/chosen": 1.2206439393939394, "rewards/margins": 9.454514907135875, "rewards/rejected": -8.233870967741936, "step": 932 }, { "epoch": 0.639698320191978, "grad_norm": 0.223864908097593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110953321.81333333, "logits/rejected": 92670377.05660377, "logps/chosen": -235.30666666666667, "logps/rejected": -323.62264150943395, "loss": 0.1845, "rewards/chosen": 1.53, "rewards/margins": 10.44509433962264, "rewards/rejected": -8.915094339622641, "step": 933 }, { "epoch": 0.6403839561193007, "grad_norm": 0.17047125300771163, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100532224.0, "logits/rejected": 102662144.0, "logps/chosen": -186.75, "logps/rejected": -334.5, "loss": 0.1532, "rewards/chosen": 1.224609375, "rewards/margins": 7.302734375, "rewards/rejected": -6.078125, "step": 934 }, { "epoch": 0.6410695920466233, "grad_norm": 0.18957616525426224, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123012182.77966101, "logits/rejected": 73947403.13043478, "logps/chosen": -177.89830508474577, "logps/rejected": -351.07246376811594, "loss": 0.1801, "rewards/chosen": 1.1069915254237288, "rewards/margins": 8.686701670351265, "rewards/rejected": -7.579710144927536, "step": 935 }, { "epoch": 0.6417552279739458, "grad_norm": 0.16467970972352228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78586004.94545455, "logits/rejected": 156051091.28767124, "logps/chosen": -255.12727272727273, "logps/rejected": -422.13698630136986, "loss": 0.1335, "rewards/chosen": 1.8227272727272728, "rewards/margins": 9.740535491905355, "rewards/rejected": -7.917808219178082, "step": 936 }, { "epoch": 0.6424408639012684, "grad_norm": 0.16309446460022398, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116081246.81481482, "logits/rejected": 63538037.62162162, "logps/chosen": -212.14814814814815, "logps/rejected": -374.9189189189189, "loss": 0.1471, "rewards/chosen": 1.5486111111111112, "rewards/margins": 7.69725975975976, "rewards/rejected": -6.148648648648648, "step": 937 }, { "epoch": 0.643126499828591, "grad_norm": 0.16824142122284627, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109772800.0, "logits/rejected": 114950144.0, "logps/chosen": -270.75, "logps/rejected": -391.0, "loss": 0.1731, "rewards/chosen": 1.18115234375, "rewards/margins": 9.93896484375, "rewards/rejected": -8.7578125, "step": 938 }, { "epoch": 0.6438121357559136, "grad_norm": 0.16762951873372564, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84267380.36363636, "logits/rejected": 75870937.42465754, "logps/chosen": -176.72727272727272, "logps/rejected": -327.67123287671234, "loss": 0.1404, "rewards/chosen": 1.5875, "rewards/margins": 9.128595890410958, "rewards/rejected": -7.541095890410959, "step": 939 }, { "epoch": 0.6444977716832362, "grad_norm": 0.18052132241834604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96708666.51428571, "logits/rejected": 77847728.55172414, "logps/chosen": -211.88571428571427, "logps/rejected": -321.37931034482756, "loss": 0.198, "rewards/chosen": 0.8803571428571428, "rewards/margins": 8.302770935960591, "rewards/rejected": -7.422413793103448, "step": 940 }, { "epoch": 0.6451834076105588, "grad_norm": 0.17045354256139159, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114810185.76271187, "logits/rejected": 114644309.33333333, "logps/chosen": -216.94915254237287, "logps/rejected": -381.6811594202899, "loss": 0.1635, "rewards/chosen": 1.458686440677966, "rewards/margins": 9.074628469663473, "rewards/rejected": -7.615942028985507, "step": 941 }, { "epoch": 0.6458690435378814, "grad_norm": 0.23371662191038145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74798421.33333333, "logits/rejected": 128672715.9322034, "logps/chosen": -186.20289855072463, "logps/rejected": -438.77966101694915, "loss": 0.1864, "rewards/chosen": 1.2998188405797102, "rewards/margins": 8.198123925325474, "rewards/rejected": -6.898305084745763, "step": 942 }, { "epoch": 0.646554679465204, "grad_norm": 0.2560010588854247, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113063846.95652173, "logits/rejected": 98032969.76271187, "logps/chosen": -284.28985507246375, "logps/rejected": -439.864406779661, "loss": 0.2126, "rewards/chosen": 1.0380434782608696, "rewards/margins": 8.326179071481208, "rewards/rejected": -7.288135593220339, "step": 943 }, { "epoch": 0.6472403153925266, "grad_norm": 0.21981234163689006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127499074.37037037, "logits/rejected": 102023610.8108108, "logps/chosen": -245.03703703703704, "logps/rejected": -430.7027027027027, "loss": 0.1717, "rewards/chosen": 0.9322916666666666, "rewards/margins": 9.148507882882882, "rewards/rejected": -8.216216216216216, "step": 944 }, { "epoch": 0.6479259513198492, "grad_norm": 0.2433108836801191, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120333135.44827586, "logits/rejected": 78733077.94285715, "logps/chosen": -227.44827586206895, "logps/rejected": -371.65714285714284, "loss": 0.1585, "rewards/chosen": 1.5398706896551724, "rewards/margins": 10.639870689655172, "rewards/rejected": -9.1, "step": 945 }, { "epoch": 0.6486115872471717, "grad_norm": 0.17316300428711248, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70107136.0, "logits/rejected": 116916224.0, "logps/chosen": -195.625, "logps/rejected": -355.5, "loss": 0.1913, "rewards/chosen": 0.9677734375, "rewards/margins": 7.6396484375, "rewards/rejected": -6.671875, "step": 946 }, { "epoch": 0.6492972231744943, "grad_norm": 0.157894273780824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 182951545.9047619, "logits/rejected": 74723138.95384616, "logps/chosen": -295.1111111111111, "logps/rejected": -319.5076923076923, "loss": 0.1515, "rewards/chosen": 1.1746031746031746, "rewards/margins": 7.328449328449329, "rewards/rejected": -6.153846153846154, "step": 947 }, { "epoch": 0.649982859101817, "grad_norm": 0.18977914752582759, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73079921.77777778, "logits/rejected": 136389778.2857143, "logps/chosen": -237.11111111111111, "logps/rejected": -340.7142857142857, "loss": 0.1974, "rewards/chosen": 1.3671875, "rewards/margins": 8.956473214285715, "rewards/rejected": -7.589285714285714, "step": 948 }, { "epoch": 0.6506684950291395, "grad_norm": 0.21796202945120097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126682198.77966101, "logits/rejected": 106012553.27536231, "logps/chosen": -252.74576271186442, "logps/rejected": -370.0869565217391, "loss": 0.1829, "rewards/chosen": 1.0582627118644068, "rewards/margins": 8.174204740849914, "rewards/rejected": -7.115942028985507, "step": 949 }, { "epoch": 0.6513541309564621, "grad_norm": 0.2223730691475535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90451077.56521739, "logits/rejected": 142321976.40677965, "logps/chosen": -228.8695652173913, "logps/rejected": -407.864406779661, "loss": 0.1874, "rewards/chosen": 1.3478260869565217, "rewards/margins": 9.093588798820928, "rewards/rejected": -7.745762711864407, "step": 950 }, { "epoch": 0.6520397668837847, "grad_norm": 0.16023712025271852, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130786024.72727273, "logits/rejected": 64808761.80645161, "logps/chosen": -287.5151515151515, "logps/rejected": -375.741935483871, "loss": 0.1713, "rewards/chosen": 2.121212121212121, "rewards/margins": 10.105083088954057, "rewards/rejected": -7.983870967741935, "step": 951 }, { "epoch": 0.6527254028111072, "grad_norm": 0.181538501087882, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128746896.69565217, "logits/rejected": 94976104.13559322, "logps/chosen": -275.94202898550725, "logps/rejected": -391.0508474576271, "loss": 0.1659, "rewards/chosen": 2.13768115942029, "rewards/margins": 10.629206583149102, "rewards/rejected": -8.491525423728813, "step": 952 }, { "epoch": 0.6534110387384299, "grad_norm": 0.21866254836925111, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94971026.28571428, "logits/rejected": 97047516.68965517, "logps/chosen": -221.94285714285715, "logps/rejected": -334.62068965517244, "loss": 0.1922, "rewards/chosen": 1.5392857142857144, "rewards/margins": 7.582389162561577, "rewards/rejected": -6.043103448275862, "step": 953 }, { "epoch": 0.6540966746657525, "grad_norm": 0.16087763064598545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112233176.94915254, "logits/rejected": 109477413.10144928, "logps/chosen": -208.0, "logps/rejected": -375.6521739130435, "loss": 0.1513, "rewards/chosen": 1.5741525423728813, "rewards/margins": 9.842268484401867, "rewards/rejected": -8.268115942028986, "step": 954 }, { "epoch": 0.6547823105930751, "grad_norm": 0.19083453825799188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145449931.93220338, "logits/rejected": 89539272.3478261, "logps/chosen": -293.4237288135593, "logps/rejected": -362.2028985507246, "loss": 0.1451, "rewards/chosen": 1.9872881355932204, "rewards/margins": 10.103230164578727, "rewards/rejected": -8.115942028985508, "step": 955 }, { "epoch": 0.6554679465203976, "grad_norm": 0.17876310721809965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115867648.0, "logits/rejected": 63537152.0, "logps/chosen": -257.625, "logps/rejected": -338.0, "loss": 0.1897, "rewards/chosen": 1.4443359375, "rewards/margins": 6.7490234375, "rewards/rejected": -5.3046875, "step": 956 }, { "epoch": 0.6561535824477203, "grad_norm": 0.15183682195835338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143645549.7142857, "logits/rejected": 68259384.8888889, "logps/chosen": -279.42857142857144, "logps/rejected": -328.44444444444446, "loss": 0.172, "rewards/chosen": 1.7410714285714286, "rewards/margins": 9.97718253968254, "rewards/rejected": -8.23611111111111, "step": 957 }, { "epoch": 0.6568392183750429, "grad_norm": 0.202036343128546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146031684.26666668, "logits/rejected": 107941647.05882353, "logps/chosen": -249.33333333333334, "logps/rejected": -408.0, "loss": 0.1737, "rewards/chosen": 1.20625, "rewards/margins": 9.838602941176472, "rewards/rejected": -8.632352941176471, "step": 958 }, { "epoch": 0.6575248543023654, "grad_norm": 0.16511237197030723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76266427.73333333, "logits/rejected": 97825972.70588236, "logps/chosen": -180.26666666666668, "logps/rejected": -334.5882352941176, "loss": 0.1707, "rewards/chosen": 1.328125, "rewards/margins": 9.254595588235293, "rewards/rejected": -7.926470588235294, "step": 959 }, { "epoch": 0.658210490229688, "grad_norm": 0.19465325550231408, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115025609.6969697, "logits/rejected": 89636335.48387097, "logps/chosen": -204.12121212121212, "logps/rejected": -422.19354838709677, "loss": 0.1981, "rewards/chosen": 1.5246212121212122, "rewards/margins": 6.887524437927664, "rewards/rejected": -5.362903225806452, "step": 960 }, { "epoch": 0.6588961261570107, "grad_norm": 0.14957441462962504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104168982.92537314, "logits/rejected": 110908399.21311475, "logps/chosen": -196.53731343283582, "logps/rejected": -421.5081967213115, "loss": 0.1731, "rewards/chosen": 1.5867537313432836, "rewards/margins": 10.086753731343283, "rewards/rejected": -8.5, "step": 961 }, { "epoch": 0.6595817620843332, "grad_norm": 0.17496006113434484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147324928.0, "logits/rejected": 73990144.0, "logps/chosen": -231.0, "logps/rejected": -340.0, "loss": 0.1722, "rewards/chosen": 1.744140625, "rewards/margins": 7.931640625, "rewards/rejected": -6.1875, "step": 962 }, { "epoch": 0.6602673980116558, "grad_norm": 0.20136807564593034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153524841.65079364, "logits/rejected": 102502336.98461539, "logps/chosen": -267.1746031746032, "logps/rejected": -352.4923076923077, "loss": 0.1642, "rewards/chosen": 1.443452380952381, "rewards/margins": 8.974221611721612, "rewards/rejected": -7.530769230769231, "step": 963 }, { "epoch": 0.6609530339389784, "grad_norm": 0.15556781426229568, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114444580.57142857, "logits/rejected": 112780174.22222222, "logps/chosen": -264.2857142857143, "logps/rejected": -419.55555555555554, "loss": 0.154, "rewards/chosen": 1.5803571428571428, "rewards/margins": 9.149801587301587, "rewards/rejected": -7.569444444444445, "step": 964 }, { "epoch": 0.661638669866301, "grad_norm": 0.19344937739496612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104018739.2, "logits/rejected": 98199974.6031746, "logps/chosen": -180.30769230769232, "logps/rejected": -387.04761904761904, "loss": 0.2013, "rewards/chosen": 0.9115384615384615, "rewards/margins": 8.308363858363858, "rewards/rejected": -7.396825396825397, "step": 965 }, { "epoch": 0.6623243057936236, "grad_norm": 0.21702181477825033, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134217728.0, "logits/rejected": 82736028.90322581, "logps/chosen": -295.27272727272725, "logps/rejected": -328.0, "loss": 0.2028, "rewards/chosen": 1.678030303030303, "rewards/margins": 7.653836754643206, "rewards/rejected": -5.975806451612903, "step": 966 }, { "epoch": 0.6630099417209462, "grad_norm": 0.17326139929398274, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114988457.35384615, "logits/rejected": 121435087.23809524, "logps/chosen": -196.6769230769231, "logps/rejected": -449.015873015873, "loss": 0.1705, "rewards/chosen": 1.3134615384615385, "rewards/margins": 9.511874236874236, "rewards/rejected": -8.198412698412698, "step": 967 }, { "epoch": 0.6636955776482688, "grad_norm": 0.20241123692191204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92154850.74285714, "logits/rejected": 102037292.13793103, "logps/chosen": -206.62857142857143, "logps/rejected": -387.58620689655174, "loss": 0.1838, "rewards/chosen": 1.3678571428571429, "rewards/margins": -11697344.839039408, "rewards/rejected": 11697346.206896551, "step": 968 }, { "epoch": 0.6643812135755913, "grad_norm": 0.1963316925907088, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82871329.03225806, "logits/rejected": 66727563.63636363, "logps/chosen": -210.58064516129033, "logps/rejected": -323.8787878787879, "loss": 0.1745, "rewards/chosen": 1.5846774193548387, "rewards/margins": 8.67558651026393, "rewards/rejected": -7.090909090909091, "step": 969 }, { "epoch": 0.665066849502914, "grad_norm": 0.17595736925012836, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160781653.33333334, "logits/rejected": 77903028.70588236, "logps/chosen": -238.4, "logps/rejected": -370.3529411764706, "loss": 0.1714, "rewards/chosen": 1.5729166666666667, "rewards/margins": 8.837622549019608, "rewards/rejected": -7.264705882352941, "step": 970 }, { "epoch": 0.6657524854302366, "grad_norm": 0.23746687444447456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121330880.92753623, "logits/rejected": 76777090.16949153, "logps/chosen": -243.47826086956522, "logps/rejected": -339.52542372881356, "loss": 0.1782, "rewards/chosen": 1.0090579710144927, "rewards/margins": 8.254820682878899, "rewards/rejected": -7.245762711864407, "step": 971 }, { "epoch": 0.6664381213575591, "grad_norm": 0.20235374553900543, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122128263.52941176, "logits/rejected": 121075575.46666667, "logps/chosen": -241.88235294117646, "logps/rejected": -370.6666666666667, "loss": 0.1928, "rewards/chosen": 1.4255514705882353, "rewards/margins": 6.875551470588236, "rewards/rejected": -5.45, "step": 972 }, { "epoch": 0.6671237572848817, "grad_norm": 0.15616609064650097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86815435.17460318, "logits/rejected": 66399058.70769231, "logps/chosen": -240.5079365079365, "logps/rejected": -332.3076923076923, "loss": 0.17, "rewards/chosen": 1.8492063492063493, "rewards/margins": 9.287667887667888, "rewards/rejected": -7.438461538461539, "step": 973 }, { "epoch": 0.6678093932122043, "grad_norm": 0.1789713032218537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109121809.06666666, "logits/rejected": 95728820.70588236, "logps/chosen": -271.46666666666664, "logps/rejected": -303.52941176470586, "loss": 0.1765, "rewards/chosen": 1.6572916666666666, "rewards/margins": 8.671997549019608, "rewards/rejected": -7.014705882352941, "step": 974 }, { "epoch": 0.668495029139527, "grad_norm": 0.17184304292732644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106918594.20689656, "logits/rejected": 77654542.62857144, "logps/chosen": -224.82758620689654, "logps/rejected": -364.8, "loss": 0.15, "rewards/chosen": 1.9913793103448276, "rewards/margins": 9.877093596059114, "rewards/rejected": -7.885714285714286, "step": 975 }, { "epoch": 0.6691806650668495, "grad_norm": 0.20222526833423923, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138412032.0, "logits/rejected": 82125970.28571428, "logps/chosen": -277.55555555555554, "logps/rejected": -403.42857142857144, "loss": 0.1948, "rewards/chosen": 1.7309027777777777, "rewards/margins": 8.900545634920634, "rewards/rejected": -7.169642857142857, "step": 976 }, { "epoch": 0.6698663009941721, "grad_norm": 0.16504186108303134, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133724280.47058824, "logits/rejected": 105129957.4025974, "logps/chosen": -273.5686274509804, "logps/rejected": -334.12987012987014, "loss": 0.1334, "rewards/chosen": 1.5183823529411764, "rewards/margins": 9.076823911382734, "rewards/rejected": -7.558441558441558, "step": 977 }, { "epoch": 0.6705519369214947, "grad_norm": 0.1901723716791215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115146752.0, "logits/rejected": 102498304.0, "logps/chosen": -231.75, "logps/rejected": -368.0, "loss": 0.1589, "rewards/chosen": 0.958984375, "rewards/margins": 9.568359375, "rewards/rejected": -8.609375, "step": 978 }, { "epoch": 0.6712375728488172, "grad_norm": 0.18714781571422634, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158415635.69230768, "logits/rejected": 36417210.92063492, "logps/chosen": -231.13846153846154, "logps/rejected": -317.968253968254, "loss": 0.1787, "rewards/chosen": 1.323076923076923, "rewards/margins": 9.243711843711845, "rewards/rejected": -7.920634920634921, "step": 979 }, { "epoch": 0.6719232087761399, "grad_norm": 0.17240799087357234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82463012.57142857, "logits/rejected": 123081127.72413793, "logps/chosen": -191.0857142857143, "logps/rejected": -408.82758620689657, "loss": 0.1883, "rewards/chosen": 1.3428571428571427, "rewards/margins": 8.33423645320197, "rewards/rejected": -6.991379310344827, "step": 980 }, { "epoch": 0.6726088447034625, "grad_norm": 0.1626272966106778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119754610.7586207, "logits/rejected": 115523115.88571429, "logps/chosen": -196.82758620689654, "logps/rejected": -377.14285714285717, "loss": 0.1678, "rewards/chosen": 0.994073275862069, "rewards/margins": 7.508358990147784, "rewards/rejected": -6.514285714285714, "step": 981 }, { "epoch": 0.673294480630785, "grad_norm": 0.1993894769404663, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 174741157.41538462, "logits/rejected": 85217605.07936507, "logps/chosen": -285.04615384615386, "logps/rejected": -371.3015873015873, "loss": 0.1771, "rewards/chosen": 1.773076923076923, "rewards/margins": 9.376251526251526, "rewards/rejected": -7.603174603174603, "step": 982 }, { "epoch": 0.6739801165581076, "grad_norm": 0.17045828044691258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136540727.13846153, "logits/rejected": 94338551.87301587, "logps/chosen": -202.33846153846153, "logps/rejected": -322.7936507936508, "loss": 0.2162, "rewards/chosen": 0.7099759615384615, "rewards/margins": 7.876642628205128, "rewards/rejected": -7.166666666666667, "step": 983 }, { "epoch": 0.6746657524854303, "grad_norm": 0.17473943546471954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 60218221.71428572, "logits/rejected": 95456573.79310344, "logps/chosen": -219.42857142857142, "logps/rejected": -287.7241379310345, "loss": 0.1727, "rewards/chosen": 1.8821428571428571, "rewards/margins": 10.218349753694582, "rewards/rejected": -8.336206896551724, "step": 984 }, { "epoch": 0.6753513884127528, "grad_norm": 0.1822173241115448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97988667.36231884, "logits/rejected": 73080415.45762712, "logps/chosen": -229.79710144927537, "logps/rejected": -366.64406779661016, "loss": 0.1802, "rewards/chosen": 1.733695652173913, "rewards/margins": 9.148949889462049, "rewards/rejected": -7.415254237288136, "step": 985 }, { "epoch": 0.6760370243400754, "grad_norm": 0.21228405232230538, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122037009.53424658, "logits/rejected": 72218288.87272727, "logps/chosen": -230.7945205479452, "logps/rejected": -364.2181818181818, "loss": 0.161, "rewards/chosen": 1.7636986301369864, "rewards/margins": 9.945516811955168, "rewards/rejected": -8.181818181818182, "step": 986 }, { "epoch": 0.676722660267398, "grad_norm": 0.22987746785278987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128036648.42105263, "logits/rejected": 85418614.15384616, "logps/chosen": -236.8421052631579, "logps/rejected": -348.3076923076923, "loss": 0.2181, "rewards/chosen": 1.064967105263158, "rewards/margins": 6.363044028340081, "rewards/rejected": -5.298076923076923, "step": 987 }, { "epoch": 0.6774082961947206, "grad_norm": 0.21671306382505828, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 205023266.7118644, "logits/rejected": 21959308.985507246, "logps/chosen": -372.6101694915254, "logps/rejected": -359.6521739130435, "loss": 0.1527, "rewards/chosen": 2.455508474576271, "rewards/margins": 8.904783836895112, "rewards/rejected": -6.449275362318841, "step": 988 }, { "epoch": 0.6780939321220432, "grad_norm": 0.1776987620616038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115527320.70175439, "logits/rejected": 144733025.35211268, "logps/chosen": -239.1578947368421, "logps/rejected": -422.76056338028167, "loss": 0.1696, "rewards/chosen": 1.2214912280701755, "rewards/margins": 9.679237706943415, "rewards/rejected": -8.45774647887324, "step": 989 }, { "epoch": 0.6787795680493658, "grad_norm": 0.2248123588450896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121602048.0, "logits/rejected": 128778240.0, "logps/chosen": -240.5, "logps/rejected": -421.5, "loss": 0.1591, "rewards/chosen": 2.15234375, "rewards/margins": 10.68359375, "rewards/rejected": -8.53125, "step": 990 }, { "epoch": 0.6794652039766884, "grad_norm": 0.16070467557590803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97255424.0, "logits/rejected": 82051072.0, "logps/chosen": -236.57142857142858, "logps/rejected": -387.1111111111111, "loss": 0.1414, "rewards/chosen": 1.21875, "rewards/margins": 10.065972222222221, "rewards/rejected": -8.847222222222221, "step": 991 }, { "epoch": 0.6801508399040109, "grad_norm": 0.20015742918917523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84925457.96491228, "logits/rejected": 103114896.22535211, "logps/chosen": -202.52631578947367, "logps/rejected": -331.2676056338028, "loss": 0.1486, "rewards/chosen": 1.0296052631578947, "rewards/margins": 9.283126389918458, "rewards/rejected": -8.253521126760564, "step": 992 }, { "epoch": 0.6808364758313336, "grad_norm": 0.18362522207039073, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72292390.64150943, "logits/rejected": 101446232.74666667, "logps/chosen": -189.88679245283018, "logps/rejected": -410.88, "loss": 0.1399, "rewards/chosen": 1.5330188679245282, "rewards/margins": -20664398.680314466, "rewards/rejected": 20664400.213333335, "step": 993 }, { "epoch": 0.6815221117586562, "grad_norm": 0.17059649424742152, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154426647.27272728, "logits/rejected": 68969240.77419356, "logps/chosen": -281.45454545454544, "logps/rejected": -365.4193548387097, "loss": 0.1814, "rewards/chosen": 1.5606060606060606, "rewards/margins": 5.6170576735092865, "rewards/rejected": -4.056451612903226, "step": 994 }, { "epoch": 0.6822077476859787, "grad_norm": 0.1687075668579024, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 180748288.0, "logits/rejected": 35373056.0, "logps/chosen": -256.0, "logps/rejected": -318.5, "loss": 0.1784, "rewards/chosen": 1.375, "rewards/margins": 9.390625, "rewards/rejected": -8.015625, "step": 995 }, { "epoch": 0.6828933836133013, "grad_norm": 0.23124919009204603, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127172608.0, "logits/rejected": 86704128.0, "logps/chosen": -243.875, "logps/rejected": -379.0, "loss": 0.1823, "rewards/chosen": 1.744140625, "rewards/margins": 9.072265625, "rewards/rejected": -7.328125, "step": 996 }, { "epoch": 0.6835790195406239, "grad_norm": 0.19572818421231925, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144830588.12121212, "logits/rejected": 78879975.22580644, "logps/chosen": -219.15151515151516, "logps/rejected": -380.9032258064516, "loss": 0.1588, "rewards/chosen": 1.7746212121212122, "rewards/margins": 9.968169599217987, "rewards/rejected": -8.193548387096774, "step": 997 }, { "epoch": 0.6842646554679466, "grad_norm": 0.1718926002080309, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86872932.84848484, "logits/rejected": 110946105.80645162, "logps/chosen": -221.57575757575756, "logps/rejected": -404.1290322580645, "loss": 0.1434, "rewards/chosen": 1.5056818181818181, "rewards/margins": 10.36858504398827, "rewards/rejected": -8.862903225806452, "step": 998 }, { "epoch": 0.6849502913952691, "grad_norm": 0.23899768658481224, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135594962.14925373, "logits/rejected": 40292822.03278688, "logps/chosen": -284.4179104477612, "logps/rejected": -361.9672131147541, "loss": 0.1986, "rewards/chosen": 1.5009328358208955, "rewards/margins": 10.017326278443846, "rewards/rejected": -8.51639344262295, "step": 999 }, { "epoch": 0.6856359273225917, "grad_norm": 0.19890004034593087, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81199104.0, "logits/rejected": 106299392.0, "logps/chosen": -190.25, "logps/rejected": -378.0, "loss": 0.1563, "rewards/chosen": 1.517578125, "rewards/margins": 9.525390625, "rewards/rejected": -8.0078125, "step": 1000 }, { "epoch": 0.6863215632499143, "grad_norm": 0.17105601044696667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119668736.0, "logits/rejected": 97910784.0, "logps/chosen": -260.75, "logps/rejected": -397.0, "loss": 0.1705, "rewards/chosen": 1.814453125, "rewards/margins": 9.564453125, "rewards/rejected": -7.75, "step": 1001 }, { "epoch": 0.6870071991772368, "grad_norm": 0.1502253385814394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112181981.6119403, "logits/rejected": 121291020.59016393, "logps/chosen": -272.0, "logps/rejected": -359.344262295082, "loss": 0.1688, "rewards/chosen": 1.7434701492537314, "rewards/margins": 8.350027526302911, "rewards/rejected": -6.60655737704918, "step": 1002 }, { "epoch": 0.6876928351045595, "grad_norm": 0.25519202565856436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122517827.36842105, "logits/rejected": 39119950.76923077, "logps/chosen": -228.21052631578948, "logps/rejected": -296.61538461538464, "loss": 0.2023, "rewards/chosen": 1.2952302631578947, "rewards/margins": 6.91061487854251, "rewards/rejected": -5.615384615384615, "step": 1003 }, { "epoch": 0.6883784710318821, "grad_norm": 0.20503302841599394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118489088.0, "logits/rejected": 42688512.0, "logps/chosen": -225.5, "logps/rejected": -366.25, "loss": 0.1895, "rewards/chosen": 1.697265625, "rewards/margins": 10.064453125, "rewards/rejected": -8.3671875, "step": 1004 }, { "epoch": 0.6890641069592046, "grad_norm": 0.1804224215788616, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130935229.2173913, "logits/rejected": 123483153.3559322, "logps/chosen": -279.6521739130435, "logps/rejected": -406.23728813559325, "loss": 0.1842, "rewards/chosen": 1.786231884057971, "rewards/margins": 10.082842053549497, "rewards/rejected": -8.296610169491526, "step": 1005 }, { "epoch": 0.6897497428865272, "grad_norm": 0.2751157078052932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139969916.34285715, "logits/rejected": 70254592.0, "logps/chosen": -256.0, "logps/rejected": -371.86206896551727, "loss": 0.191, "rewards/chosen": 1.4848214285714285, "rewards/margins": 8.467580049261084, "rewards/rejected": -6.982758620689655, "step": 1006 }, { "epoch": 0.6904353788138499, "grad_norm": 0.1664415366473156, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98705954.13333334, "logits/rejected": 71750354.8235294, "logps/chosen": -224.26666666666668, "logps/rejected": -376.0, "loss": 0.1624, "rewards/chosen": 0.9520833333333333, "rewards/margins": 8.775612745098039, "rewards/rejected": -7.823529411764706, "step": 1007 }, { "epoch": 0.6911210147411725, "grad_norm": 0.18490601122416897, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130930300.54054055, "logits/rejected": 66739920.59259259, "logps/chosen": -203.8918918918919, "logps/rejected": -311.1111111111111, "loss": 0.2006, "rewards/chosen": 1.4797297297297298, "rewards/margins": 6.877877877877878, "rewards/rejected": -5.398148148148148, "step": 1008 }, { "epoch": 0.691806650668495, "grad_norm": 0.15594412012289605, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68050805.15254237, "logits/rejected": 144429946.4347826, "logps/chosen": -248.40677966101694, "logps/rejected": -395.82608695652175, "loss": 0.12, "rewards/chosen": 1.930084745762712, "rewards/margins": 11.806896339965611, "rewards/rejected": -9.876811594202898, "step": 1009 }, { "epoch": 0.6924922865958176, "grad_norm": 0.1836981985081094, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152285499.07692307, "logits/rejected": 58087781.58730159, "logps/chosen": -223.5076923076923, "logps/rejected": -329.3968253968254, "loss": 0.1679, "rewards/chosen": 1.4442307692307692, "rewards/margins": 9.499786324786324, "rewards/rejected": -8.055555555555555, "step": 1010 }, { "epoch": 0.6931779225231403, "grad_norm": 0.20752236979940442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158544691.2, "logits/rejected": 56912366.344827585, "logps/chosen": -265.8285714285714, "logps/rejected": -345.9310344827586, "loss": 0.1802, "rewards/chosen": 1.24375, "rewards/margins": 7.717887931034483, "rewards/rejected": -6.474137931034483, "step": 1011 }, { "epoch": 0.6938635584504628, "grad_norm": 0.16658796379291657, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143871284.82539684, "logits/rejected": 77691415.63076924, "logps/chosen": -229.5873015873016, "logps/rejected": -388.18461538461537, "loss": 0.1729, "rewards/chosen": 1.6111111111111112, "rewards/margins": 7.688034188034187, "rewards/rejected": -6.076923076923077, "step": 1012 }, { "epoch": 0.6945491943777854, "grad_norm": 0.23455567088032617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110143572.16438356, "logits/rejected": 69434796.21818182, "logps/chosen": -236.4931506849315, "logps/rejected": -425.0181818181818, "loss": 0.2029, "rewards/chosen": 1.341609589041096, "rewards/margins": 8.177973225404731, "rewards/rejected": -6.836363636363636, "step": 1013 }, { "epoch": 0.695234830305108, "grad_norm": 0.24058902322444706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111794013.46031746, "logits/rejected": 121570288.24615385, "logps/chosen": -246.984126984127, "logps/rejected": -354.46153846153845, "loss": 0.1766, "rewards/chosen": 1.1944444444444444, "rewards/margins": 9.632905982905983, "rewards/rejected": -8.438461538461539, "step": 1014 }, { "epoch": 0.6959204662324305, "grad_norm": 0.1811592464283318, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146729550.1016949, "logits/rejected": 84038047.53623189, "logps/chosen": -257.6271186440678, "logps/rejected": -373.3333333333333, "loss": 0.1381, "rewards/chosen": 1.5296610169491525, "rewards/margins": 9.05864652419553, "rewards/rejected": -7.528985507246377, "step": 1015 }, { "epoch": 0.6966061021597532, "grad_norm": 0.1507977922406891, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115713445.64705883, "logits/rejected": 136175069.86666667, "logps/chosen": -238.8235294117647, "logps/rejected": -398.4, "loss": 0.1328, "rewards/chosen": 2.1488970588235294, "rewards/margins": 11.065563725490195, "rewards/rejected": -8.916666666666666, "step": 1016 }, { "epoch": 0.6972917380870758, "grad_norm": 0.17153777988008692, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135476019.2, "logits/rejected": 48481219.76470588, "logps/chosen": -232.0, "logps/rejected": -335.7647058823529, "loss": 0.1527, "rewards/chosen": 1.8604166666666666, "rewards/margins": 8.77218137254902, "rewards/rejected": -6.911764705882353, "step": 1017 }, { "epoch": 0.6979773740143983, "grad_norm": 0.2034699432287474, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128172995.76470588, "logits/rejected": 82557883.73333333, "logps/chosen": -226.58823529411765, "logps/rejected": -418.6666666666667, "loss": 0.187, "rewards/chosen": 1.1309742647058822, "rewards/margins": 10.347640931372549, "rewards/rejected": -9.216666666666667, "step": 1018 }, { "epoch": 0.6986630099417209, "grad_norm": 0.23382453800915465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126585469.90163934, "logits/rejected": 115625066.98507462, "logps/chosen": -237.63934426229508, "logps/rejected": -420.2985074626866, "loss": 0.1581, "rewards/chosen": 0.9559426229508197, "rewards/margins": 9.261912772204552, "rewards/rejected": -8.305970149253731, "step": 1019 }, { "epoch": 0.6993486458690436, "grad_norm": 0.19290099557708412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110958405.81818181, "logits/rejected": 64918693.161290325, "logps/chosen": -224.0, "logps/rejected": -371.0967741935484, "loss": 0.1813, "rewards/chosen": 1.3229166666666667, "rewards/margins": 7.451948924731183, "rewards/rejected": -6.129032258064516, "step": 1020 }, { "epoch": 0.7000342817963662, "grad_norm": 0.16683096603780057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77437337.6, "logits/rejected": 115343360.0, "logps/chosen": -202.4, "logps/rejected": -365.6470588235294, "loss": 0.1521, "rewards/chosen": 1.70625, "rewards/margins": 10.375367647058825, "rewards/rejected": -8.669117647058824, "step": 1021 }, { "epoch": 0.7007199177236887, "grad_norm": 0.18740900201263472, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110038799.05882353, "logits/rejected": 113525828.26666667, "logps/chosen": -265.88235294117646, "logps/rejected": -384.0, "loss": 0.1801, "rewards/chosen": 1.5399816176470589, "rewards/margins": 10.323314950980393, "rewards/rejected": -8.783333333333333, "step": 1022 }, { "epoch": 0.7014055536510113, "grad_norm": 0.15648978869201602, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102511633.3559322, "logits/rejected": 110936301.44927536, "logps/chosen": -179.25423728813558, "logps/rejected": -387.2463768115942, "loss": 0.1631, "rewards/chosen": 0.9216101694915254, "rewards/margins": 7.747697126013264, "rewards/rejected": -6.826086956521739, "step": 1023 }, { "epoch": 0.7020911895783339, "grad_norm": 0.17736208418950952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139051841.08474576, "logits/rejected": 79296660.4057971, "logps/chosen": -244.0677966101695, "logps/rejected": -363.1304347826087, "loss": 0.175, "rewards/chosen": 1.3411016949152543, "rewards/margins": 9.688927781871776, "rewards/rejected": -8.347826086956522, "step": 1024 }, { "epoch": 0.7027768255056565, "grad_norm": 0.1905392566625864, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150994944.0, "logits/rejected": 80605535.08571428, "logps/chosen": -282.7586206896552, "logps/rejected": -440.22857142857146, "loss": 0.1586, "rewards/chosen": 1.5727370689655173, "rewards/margins": 7.787022783251231, "rewards/rejected": -6.214285714285714, "step": 1025 }, { "epoch": 0.7034624614329791, "grad_norm": 0.22229606745042613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151249144.24242425, "logits/rejected": 87403883.35483871, "logps/chosen": -287.5151515151515, "logps/rejected": -354.06451612903226, "loss": 0.1734, "rewards/chosen": 1.871212121212121, "rewards/margins": 9.726050830889541, "rewards/rejected": -7.854838709677419, "step": 1026 }, { "epoch": 0.7041480973603017, "grad_norm": 0.2079468776308766, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110084348.06153846, "logits/rejected": 83220317.46031746, "logps/chosen": -284.8, "logps/rejected": -381.968253968254, "loss": 0.1626, "rewards/chosen": 1.8634615384615385, "rewards/margins": 10.458699633699634, "rewards/rejected": -8.595238095238095, "step": 1027 }, { "epoch": 0.7048337332876242, "grad_norm": 0.16998041455059776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96646216.11267605, "logits/rejected": 96836913.40350877, "logps/chosen": -241.35211267605635, "logps/rejected": -392.42105263157896, "loss": 0.1535, "rewards/chosen": 2.091549295774648, "rewards/margins": 11.302075611564122, "rewards/rejected": -9.210526315789474, "step": 1028 }, { "epoch": 0.7055193692149468, "grad_norm": 0.17303403062052924, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100913702.20895523, "logits/rejected": 89958366.4262295, "logps/chosen": -192.47761194029852, "logps/rejected": -392.39344262295083, "loss": 0.1701, "rewards/chosen": 1.6026119402985075, "rewards/margins": 9.496054563249327, "rewards/rejected": -7.89344262295082, "step": 1029 }, { "epoch": 0.7062050051422695, "grad_norm": 0.2025815003929466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120297501.68115942, "logits/rejected": 94514019.79661018, "logps/chosen": -289.6231884057971, "logps/rejected": -385.6271186440678, "loss": 0.1577, "rewards/chosen": 1.8568840579710144, "rewards/margins": 9.74671456644559, "rewards/rejected": -7.889830508474576, "step": 1030 }, { "epoch": 0.7068906410695921, "grad_norm": 0.1677296053831904, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158664528.45714286, "logits/rejected": 88514277.51724137, "logps/chosen": -280.6857142857143, "logps/rejected": -322.7586206896552, "loss": 0.1577, "rewards/chosen": 1.4642857142857142, "rewards/margins": 8.145320197044335, "rewards/rejected": -6.681034482758621, "step": 1031 }, { "epoch": 0.7075762769969146, "grad_norm": 0.2116004428915325, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130582664.53333333, "logits/rejected": 87710298.35294117, "logps/chosen": -249.6, "logps/rejected": -408.94117647058823, "loss": 0.1505, "rewards/chosen": 1.9916666666666667, "rewards/margins": 10.565196078431374, "rewards/rejected": -8.573529411764707, "step": 1032 }, { "epoch": 0.7082619129242372, "grad_norm": 0.19221141822861323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117178368.0, "logits/rejected": 84049920.0, "logps/chosen": -263.5, "logps/rejected": -356.5, "loss": 0.1897, "rewards/chosen": 1.064453125, "rewards/margins": 10.119140625, "rewards/rejected": -9.0546875, "step": 1033 }, { "epoch": 0.7089475488515599, "grad_norm": 0.15998636675729414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134774784.0, "logits/rejected": 63373312.0, "logps/chosen": -255.5, "logps/rejected": -351.5, "loss": 0.1612, "rewards/chosen": 1.994140625, "rewards/margins": 10.548828125, "rewards/rejected": -8.5546875, "step": 1034 }, { "epoch": 0.7096331847788824, "grad_norm": 0.20803986351713974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97502371.24637681, "logits/rejected": 86871855.72881356, "logps/chosen": -263.6521739130435, "logps/rejected": -345.22033898305085, "loss": 0.2051, "rewards/chosen": 1.5579710144927537, "rewards/margins": 9.524072709408008, "rewards/rejected": -7.966101694915254, "step": 1035 }, { "epoch": 0.710318820706205, "grad_norm": 0.2561145281050086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131858432.0, "logits/rejected": 103677952.0, "logps/chosen": -294.5, "logps/rejected": -398.5, "loss": 0.181, "rewards/chosen": 1.37890625, "rewards/margins": 9.40234375, "rewards/rejected": -8.0234375, "step": 1036 }, { "epoch": 0.7110044566335276, "grad_norm": 0.1865449372653777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126228577.52380952, "logits/rejected": 50799474.21538462, "logps/chosen": -192.12698412698413, "logps/rejected": -331.5692307692308, "loss": 0.1823, "rewards/chosen": 1.0317460317460319, "rewards/margins": 9.931746031746032, "rewards/rejected": -8.9, "step": 1037 }, { "epoch": 0.7116900925608501, "grad_norm": 0.16100953049841568, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63911566.68852459, "logits/rejected": 102290936.35820895, "logps/chosen": -203.27868852459017, "logps/rejected": -382.56716417910445, "loss": 0.1508, "rewards/chosen": 1.7377049180327868, "rewards/margins": 10.021287007585025, "rewards/rejected": -8.283582089552239, "step": 1038 }, { "epoch": 0.7123757284881728, "grad_norm": 0.18270130379840127, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107786955.17460318, "logits/rejected": 68722057.84615384, "logps/chosen": -178.66666666666666, "logps/rejected": -353.96923076923076, "loss": 0.158, "rewards/chosen": 1.4533730158730158, "rewards/margins": 6.2379884004884, "rewards/rejected": -4.7846153846153845, "step": 1039 }, { "epoch": 0.7130613644154954, "grad_norm": 0.17511120005432612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97655086.16393442, "logits/rejected": 118504738.3880597, "logps/chosen": -219.54098360655738, "logps/rejected": -413.13432835820896, "loss": 0.1585, "rewards/chosen": 1.4795081967213115, "rewards/margins": 9.472045510154148, "rewards/rejected": -7.992537313432836, "step": 1040 }, { "epoch": 0.713747000342818, "grad_norm": 0.21925849319835003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137153740.8, "logits/rejected": 59337065.4117647, "logps/chosen": -254.4, "logps/rejected": -374.5882352941176, "loss": 0.1772, "rewards/chosen": 1.5854166666666667, "rewards/margins": 9.00453431372549, "rewards/rejected": -7.419117647058823, "step": 1041 }, { "epoch": 0.7144326362701405, "grad_norm": 0.191819375686719, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96339936.4923077, "logits/rejected": 128492170.15873016, "logps/chosen": -202.58461538461538, "logps/rejected": -349.968253968254, "loss": 0.1571, "rewards/chosen": 1.7019230769230769, "rewards/margins": 7.432081807081807, "rewards/rejected": -5.73015873015873, "step": 1042 }, { "epoch": 0.7151182721974632, "grad_norm": 0.21234269372767328, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96324360.8275862, "logits/rejected": 91735420.34285714, "logps/chosen": -204.82758620689654, "logps/rejected": -362.9714285714286, "loss": 0.1453, "rewards/chosen": 1.6831896551724137, "rewards/margins": 9.197475369458129, "rewards/rejected": -7.514285714285714, "step": 1043 }, { "epoch": 0.7158039081247858, "grad_norm": 0.18186222026118426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147921531.58620688, "logits/rejected": 54128991.08571429, "logps/chosen": -266.8965517241379, "logps/rejected": -352.45714285714286, "loss": 0.1528, "rewards/chosen": 1.9439655172413792, "rewards/margins": 7.343965517241379, "rewards/rejected": -5.4, "step": 1044 }, { "epoch": 0.7164895440521083, "grad_norm": 0.15970390811277543, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113684418.86567163, "logits/rejected": 101900959.47540984, "logps/chosen": -213.97014925373134, "logps/rejected": -368.78688524590166, "loss": 0.1823, "rewards/chosen": 1.5629664179104477, "rewards/margins": 5.80886805725471, "rewards/rejected": -4.245901639344262, "step": 1045 }, { "epoch": 0.7171751799794309, "grad_norm": 0.18967308403728758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144703488.0, "logits/rejected": 39920786.28571428, "logps/chosen": -230.22222222222223, "logps/rejected": -327.7142857142857, "loss": 0.1569, "rewards/chosen": 1.7222222222222223, "rewards/margins": 9.75793650793651, "rewards/rejected": -8.035714285714286, "step": 1046 }, { "epoch": 0.7178608159067535, "grad_norm": 0.17569395529059048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158875151.5151515, "logits/rejected": 52699400.258064516, "logps/chosen": -330.90909090909093, "logps/rejected": -410.3225806451613, "loss": 0.1576, "rewards/chosen": 2.803030303030303, "rewards/margins": 11.278836754643207, "rewards/rejected": -8.475806451612904, "step": 1047 }, { "epoch": 0.7185464518340761, "grad_norm": 0.21527931265968092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141467142.32098764, "logits/rejected": 74777970.38297872, "logps/chosen": -288.7901234567901, "logps/rejected": -326.1276595744681, "loss": 0.223, "rewards/chosen": 1.4135802469135803, "rewards/margins": 8.966771736275282, "rewards/rejected": -7.553191489361702, "step": 1048 }, { "epoch": 0.7192320877613987, "grad_norm": 0.21528646919208042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111466806.3030303, "logits/rejected": 116493411.09677419, "logps/chosen": -219.15151515151516, "logps/rejected": -394.3225806451613, "loss": 0.1841, "rewards/chosen": 1.415719696969697, "rewards/margins": 9.327010019550341, "rewards/rejected": -7.911290322580645, "step": 1049 }, { "epoch": 0.7199177236887213, "grad_norm": 0.19047432131288305, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79930088.72727273, "logits/rejected": 111554956.38709678, "logps/chosen": -209.6969696969697, "logps/rejected": -433.80645161290323, "loss": 0.1597, "rewards/chosen": 1.316287878787879, "rewards/margins": 9.945320136852395, "rewards/rejected": -8.629032258064516, "step": 1050 }, { "epoch": 0.7206033596160438, "grad_norm": 0.20720386458011808, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120552414.96774194, "logits/rejected": 51507324.121212125, "logps/chosen": -267.35483870967744, "logps/rejected": -328.24242424242425, "loss": 0.1681, "rewards/chosen": 1.8165322580645162, "rewards/margins": 9.755926197458455, "rewards/rejected": -7.9393939393939394, "step": 1051 }, { "epoch": 0.7212889955433665, "grad_norm": 0.189133289819101, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162604178.2857143, "logits/rejected": 121867832.8888889, "logps/chosen": -309.14285714285717, "logps/rejected": -462.22222222222223, "loss": 0.1551, "rewards/chosen": 1.7678571428571428, "rewards/margins": 9.281746031746032, "rewards/rejected": -7.513888888888889, "step": 1052 }, { "epoch": 0.7219746314706891, "grad_norm": 0.2063354120017306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106788311.36507936, "logits/rejected": 97243325.04615384, "logps/chosen": -241.26984126984127, "logps/rejected": -388.9230769230769, "loss": 0.1797, "rewards/chosen": 1.1031746031746033, "rewards/margins": 10.441636141636142, "rewards/rejected": -9.338461538461539, "step": 1053 }, { "epoch": 0.7226602673980117, "grad_norm": 0.20459600213242113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85012907.94029851, "logits/rejected": 47650043.80327869, "logps/chosen": -190.08955223880596, "logps/rejected": -290.88524590163934, "loss": 0.1733, "rewards/chosen": 1.5093283582089552, "rewards/margins": 7.30441032542207, "rewards/rejected": -5.795081967213115, "step": 1054 }, { "epoch": 0.7233459033253342, "grad_norm": 0.23162252960630558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118279372.8, "logits/rejected": 123485244.23529412, "logps/chosen": -264.8, "logps/rejected": -444.2352941176471, "loss": 0.1352, "rewards/chosen": 2.3229166666666665, "rewards/margins": 8.800857843137255, "rewards/rejected": -6.477941176470588, "step": 1055 }, { "epoch": 0.7240315392526568, "grad_norm": 0.17287823462209517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151693994.66666666, "logits/rejected": 67263066.35294117, "logps/chosen": -234.13333333333333, "logps/rejected": -332.0, "loss": 0.188, "rewards/chosen": 1.85, "rewards/margins": 8.680882352941177, "rewards/rejected": -6.830882352941177, "step": 1056 }, { "epoch": 0.7247171751799795, "grad_norm": 0.15163068229140747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137115610.76363635, "logits/rejected": 89947711.12328768, "logps/chosen": -255.12727272727273, "logps/rejected": -411.6164383561644, "loss": 0.1331, "rewards/chosen": 1.6227272727272728, "rewards/margins": 10.122727272727273, "rewards/rejected": -8.5, "step": 1057 }, { "epoch": 0.725402811107302, "grad_norm": 0.20645001824633868, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137232384.0, "logits/rejected": 32669696.0, "logps/chosen": -202.25, "logps/rejected": -334.5, "loss": 0.2032, "rewards/chosen": 0.8818359375, "rewards/margins": 9.9755859375, "rewards/rejected": -9.09375, "step": 1058 }, { "epoch": 0.7260884470346246, "grad_norm": 0.19170730784897336, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81825720.14035088, "logits/rejected": 116849764.95774648, "logps/chosen": -200.42105263157896, "logps/rejected": -419.6056338028169, "loss": 0.1616, "rewards/chosen": 1.0405701754385965, "rewards/margins": 9.434936372621694, "rewards/rejected": -8.394366197183098, "step": 1059 }, { "epoch": 0.7267740829619472, "grad_norm": 0.1918997839459118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129279273.29032259, "logits/rejected": 50554073.21212121, "logps/chosen": -187.3548387096774, "logps/rejected": -391.27272727272725, "loss": 0.1866, "rewards/chosen": 1.1204637096774193, "rewards/margins": 10.037130376344086, "rewards/rejected": -8.916666666666666, "step": 1060 }, { "epoch": 0.7274597188892697, "grad_norm": 0.15520688361301047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140318533.8181818, "logits/rejected": 99310294.70967741, "logps/chosen": -227.63636363636363, "logps/rejected": -398.4516129032258, "loss": 0.1801, "rewards/chosen": 1.3977272727272727, "rewards/margins": 9.873533724340177, "rewards/rejected": -8.475806451612904, "step": 1061 }, { "epoch": 0.7281453548165924, "grad_norm": 0.16167375382818203, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101669928.96, "logits/rejected": 108406626.46153846, "logps/chosen": -226.56, "logps/rejected": -389.7435897435897, "loss": 0.1262, "rewards/chosen": 1.754375, "rewards/margins": 10.600528846153846, "rewards/rejected": -8.846153846153847, "step": 1062 }, { "epoch": 0.728830990743915, "grad_norm": 0.1525342659325797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173830599.1111111, "logits/rejected": 109618701.83783785, "logps/chosen": -398.22222222222223, "logps/rejected": -400.0, "loss": 0.1355, "rewards/chosen": 2.396990740740741, "rewards/margins": 11.072666416416416, "rewards/rejected": -8.675675675675675, "step": 1063 }, { "epoch": 0.7295166266712376, "grad_norm": 0.1983072635926337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139169336.8888889, "logits/rejected": 159121408.0, "logps/chosen": -239.22222222222223, "logps/rejected": -387.42857142857144, "loss": 0.1871, "rewards/chosen": 1.5616319444444444, "rewards/margins": 9.766989087301587, "rewards/rejected": -8.205357142857142, "step": 1064 }, { "epoch": 0.7302022625985601, "grad_norm": 0.156292241211658, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94450091.94029851, "logits/rejected": 137380645.7704918, "logps/chosen": -242.62686567164178, "logps/rejected": -393.44262295081967, "loss": 0.1817, "rewards/chosen": 1.8526119402985075, "rewards/margins": 7.680480792757525, "rewards/rejected": -5.827868852459017, "step": 1065 }, { "epoch": 0.7308878985258828, "grad_norm": 0.1895438153416712, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125459034.35294117, "logits/rejected": 63543705.6, "logps/chosen": -233.88235294117646, "logps/rejected": -343.73333333333335, "loss": 0.1954, "rewards/chosen": 1.6452205882352942, "rewards/margins": 9.128553921568628, "rewards/rejected": -7.483333333333333, "step": 1066 }, { "epoch": 0.7315735344532054, "grad_norm": 0.18254335261175378, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146451114.66666666, "logits/rejected": 52728393.14285714, "logps/chosen": -226.22222222222223, "logps/rejected": -444.57142857142856, "loss": 0.1997, "rewards/chosen": 1.7439236111111112, "rewards/margins": 10.538566468253968, "rewards/rejected": -8.794642857142858, "step": 1067 }, { "epoch": 0.7322591703805279, "grad_norm": 0.2152896069316519, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108251173.23636363, "logits/rejected": 98020309.91780822, "logps/chosen": -271.41818181818184, "logps/rejected": -348.05479452054794, "loss": 0.1492, "rewards/chosen": 1.518181818181818, "rewards/margins": 9.778455790784557, "rewards/rejected": -8.26027397260274, "step": 1068 }, { "epoch": 0.7329448063078505, "grad_norm": 0.19161888197738644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116098334.72, "logits/rejected": 101612949.73584905, "logps/chosen": -223.78666666666666, "logps/rejected": -427.47169811320754, "loss": 0.1759, "rewards/chosen": 1.6575, "rewards/margins": 11.119764150943396, "rewards/rejected": -9.462264150943396, "step": 1069 }, { "epoch": 0.7336304422351732, "grad_norm": 0.20412464204570174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137194330.83870968, "logits/rejected": 98375493.81818181, "logps/chosen": -242.06451612903226, "logps/rejected": -397.57575757575756, "loss": 0.1738, "rewards/chosen": 1.3518145161290323, "rewards/margins": 10.942723607038124, "rewards/rejected": -9.590909090909092, "step": 1070 }, { "epoch": 0.7343160781624957, "grad_norm": 0.16595061584555038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99803807.47540984, "logits/rejected": 93276312.8358209, "logps/chosen": -207.7377049180328, "logps/rejected": -373.0149253731343, "loss": 0.1516, "rewards/chosen": 1.6372950819672132, "rewards/margins": 10.808936873011989, "rewards/rejected": -9.171641791044776, "step": 1071 }, { "epoch": 0.7350017140898183, "grad_norm": 0.1556040302793269, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140064333.57575756, "logits/rejected": 92934276.12903225, "logps/chosen": -293.3333333333333, "logps/rejected": -411.0967741935484, "loss": 0.1428, "rewards/chosen": 2.2992424242424243, "rewards/margins": 9.10569403714565, "rewards/rejected": -6.806451612903226, "step": 1072 }, { "epoch": 0.7356873500171409, "grad_norm": 0.19557638538105346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134273652.05333334, "logits/rejected": 72777109.73584905, "logps/chosen": -229.54666666666665, "logps/rejected": -378.5660377358491, "loss": 0.1864, "rewards/chosen": 1.6283333333333334, "rewards/margins": 6.241540880503145, "rewards/rejected": -4.613207547169812, "step": 1073 }, { "epoch": 0.7363729859444635, "grad_norm": 0.173678840636699, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131942851.2542373, "logits/rejected": 92457049.04347827, "logps/chosen": -227.52542372881356, "logps/rejected": -391.8840579710145, "loss": 0.1657, "rewards/chosen": 0.9830508474576272, "rewards/margins": 9.012036354704005, "rewards/rejected": -8.028985507246377, "step": 1074 }, { "epoch": 0.7370586218717861, "grad_norm": 0.1856648368495576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159841112.43636364, "logits/rejected": 48579233.31506849, "logps/chosen": -230.1090909090909, "logps/rejected": -372.6027397260274, "loss": 0.1608, "rewards/chosen": 1.7181818181818183, "rewards/margins": 9.560647571606475, "rewards/rejected": -7.842465753424658, "step": 1075 }, { "epoch": 0.7377442577991087, "grad_norm": 0.1983821309139802, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76396251.42857143, "logits/rejected": 130963526.62068966, "logps/chosen": -241.82857142857142, "logps/rejected": -402.7586206896552, "loss": 0.1802, "rewards/chosen": 1.6669642857142857, "rewards/margins": 10.589378078817735, "rewards/rejected": -8.922413793103448, "step": 1076 }, { "epoch": 0.7384298937264313, "grad_norm": 0.20118057936323755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131548625.45454545, "logits/rejected": 95115990.70967741, "logps/chosen": -232.0, "logps/rejected": -422.19354838709677, "loss": 0.1818, "rewards/chosen": 1.65625, "rewards/margins": 8.65625, "rewards/rejected": -7.0, "step": 1077 }, { "epoch": 0.7391155296537538, "grad_norm": 0.19293465113677954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77307342.90410958, "logits/rejected": 120338394.76363637, "logps/chosen": -190.46575342465752, "logps/rejected": -377.6, "loss": 0.1835, "rewards/chosen": 1.6284246575342465, "rewards/margins": 10.39206102117061, "rewards/rejected": -8.763636363636364, "step": 1078 }, { "epoch": 0.7398011655810764, "grad_norm": 0.19511612881273785, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122100849.77777778, "logits/rejected": 65366614.646153845, "logps/chosen": -233.65079365079364, "logps/rejected": -394.83076923076925, "loss": 0.1714, "rewards/chosen": 1.6656746031746033, "rewards/margins": 8.488751526251527, "rewards/rejected": -6.823076923076923, "step": 1079 }, { "epoch": 0.7404868015083991, "grad_norm": 0.21513775388561795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93832572.34285714, "logits/rejected": 76799152.55172414, "logps/chosen": -224.9142857142857, "logps/rejected": -396.13793103448273, "loss": 0.1625, "rewards/chosen": 1.9839285714285715, "rewards/margins": 8.130480295566501, "rewards/rejected": -6.146551724137931, "step": 1080 }, { "epoch": 0.7411724374357216, "grad_norm": 0.19457437645561088, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122112843.29411764, "logits/rejected": 70783249.06666666, "logps/chosen": -243.05882352941177, "logps/rejected": -347.73333333333335, "loss": 0.1964, "rewards/chosen": 1.6277573529411764, "rewards/margins": 7.919424019607844, "rewards/rejected": -6.291666666666667, "step": 1081 }, { "epoch": 0.7418580733630442, "grad_norm": 0.15756812565704403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102081957.64705883, "logits/rejected": 92982476.8, "logps/chosen": -193.76470588235293, "logps/rejected": -329.06666666666666, "loss": 0.1744, "rewards/chosen": 1.5294117647058822, "rewards/margins": 8.037745098039217, "rewards/rejected": -6.508333333333334, "step": 1082 }, { "epoch": 0.7425437092903668, "grad_norm": 0.19014050650604816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112254827.05454545, "logits/rejected": 66419389.36986301, "logps/chosen": -202.47272727272727, "logps/rejected": -344.986301369863, "loss": 0.1867, "rewards/chosen": 0.9414772727272728, "rewards/margins": 9.366134806973848, "rewards/rejected": -8.424657534246576, "step": 1083 }, { "epoch": 0.7432293452176894, "grad_norm": 0.17164028550523927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114900299.71830986, "logits/rejected": 65526801.96491228, "logps/chosen": -242.70422535211267, "logps/rejected": -375.0175438596491, "loss": 0.1686, "rewards/chosen": 1.3345070422535212, "rewards/margins": 7.957314059797381, "rewards/rejected": -6.62280701754386, "step": 1084 }, { "epoch": 0.743914981145012, "grad_norm": 0.18517060529841847, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144781160.2962963, "logits/rejected": 83602681.08108108, "logps/chosen": -225.03703703703704, "logps/rejected": -371.02702702702703, "loss": 0.1756, "rewards/chosen": 0.96875, "rewards/margins": 6.96875, "rewards/rejected": -6.0, "step": 1085 }, { "epoch": 0.7446006170723346, "grad_norm": 0.20235897253846483, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123088841.38666667, "logits/rejected": 118311027.9245283, "logps/chosen": -211.41333333333333, "logps/rejected": -404.37735849056605, "loss": 0.2012, "rewards/chosen": 1.4516666666666667, "rewards/margins": 10.961100628930817, "rewards/rejected": -9.50943396226415, "step": 1086 }, { "epoch": 0.7452862529996572, "grad_norm": 0.20669981801286713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116537123.44615385, "logits/rejected": 97201330.79365079, "logps/chosen": -185.84615384615384, "logps/rejected": -378.92063492063494, "loss": 0.1828, "rewards/chosen": 0.9298076923076923, "rewards/margins": 10.175839438339437, "rewards/rejected": -9.246031746031745, "step": 1087 }, { "epoch": 0.7459718889269797, "grad_norm": 0.1509962953891768, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63064697.018181816, "logits/rejected": 112930198.79452054, "logps/chosen": -211.78181818181818, "logps/rejected": -367.7808219178082, "loss": 0.1411, "rewards/chosen": 1.4625, "rewards/margins": 9.640582191780823, "rewards/rejected": -8.178082191780822, "step": 1088 }, { "epoch": 0.7466575248543024, "grad_norm": 0.2342705242870241, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139943285.84126985, "logits/rejected": 68496210.70769231, "logps/chosen": -269.7142857142857, "logps/rejected": -446.5230769230769, "loss": 0.1677, "rewards/chosen": 1.5813492063492063, "rewards/margins": 10.258272283272282, "rewards/rejected": -8.676923076923076, "step": 1089 }, { "epoch": 0.747343160781625, "grad_norm": 0.15053503114558298, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153282746.1818182, "logits/rejected": 78542651.61643836, "logps/chosen": -329.3090909090909, "logps/rejected": -384.0, "loss": 0.1515, "rewards/chosen": 1.7159090909090908, "rewards/margins": 10.962484433374843, "rewards/rejected": -9.246575342465754, "step": 1090 }, { "epoch": 0.7480287967089475, "grad_norm": 0.20042793520597366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 66399967.549295776, "logits/rejected": 167036317.19298247, "logps/chosen": -210.92957746478874, "logps/rejected": -404.2105263157895, "loss": 0.1843, "rewards/chosen": 1.5616197183098592, "rewards/margins": 8.859865332344947, "rewards/rejected": -7.298245614035087, "step": 1091 }, { "epoch": 0.7487144326362701, "grad_norm": 0.26027897095857244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116423711.03030303, "logits/rejected": 56301766.19354839, "logps/chosen": -251.87878787878788, "logps/rejected": -361.5483870967742, "loss": 0.1739, "rewards/chosen": 1.0880681818181819, "rewards/margins": 9.870326246334312, "rewards/rejected": -8.78225806451613, "step": 1092 }, { "epoch": 0.7494000685635928, "grad_norm": 0.20618667788507172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130023424.0, "logits/rejected": 107679222.69090909, "logps/chosen": -234.95890410958904, "logps/rejected": -411.92727272727274, "loss": 0.1959, "rewards/chosen": 1.3681506849315068, "rewards/margins": 9.595423412204234, "rewards/rejected": -8.227272727272727, "step": 1093 }, { "epoch": 0.7500857044909153, "grad_norm": 0.21404852585286316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84427280.51612903, "logits/rejected": 83695429.81818181, "logps/chosen": -158.58064516129033, "logps/rejected": -363.6363636363636, "loss": 0.1775, "rewards/chosen": 1.466733870967742, "rewards/margins": 8626965.830370234, "rewards/rejected": -8626964.363636363, "step": 1094 }, { "epoch": 0.7507713404182379, "grad_norm": 0.18191347393394314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139810133.33333334, "logits/rejected": 70994763.29411764, "logps/chosen": -325.6, "logps/rejected": -363.29411764705884, "loss": 0.1487, "rewards/chosen": 1.8166666666666667, "rewards/margins": 10.346078431372549, "rewards/rejected": -8.529411764705882, "step": 1095 }, { "epoch": 0.7514569763455605, "grad_norm": 0.20655719281083731, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71860224.0, "logits/rejected": 136708096.0, "logps/chosen": -198.125, "logps/rejected": -425.5, "loss": 0.1739, "rewards/chosen": 1.3779296875, "rewards/margins": 10.5107421875, "rewards/rejected": -9.1328125, "step": 1096 }, { "epoch": 0.7521426122728831, "grad_norm": 0.1629133957592733, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122176016.51612903, "logits/rejected": 94689590.3030303, "logps/chosen": -248.25806451612902, "logps/rejected": -367.5151515151515, "loss": 0.1579, "rewards/chosen": 1.752016129032258, "rewards/margins": 9.986864613880742, "rewards/rejected": -8.234848484848484, "step": 1097 }, { "epoch": 0.7528282482002057, "grad_norm": 0.19119602356363838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92849250.19178082, "logits/rejected": 138640812.21818182, "logps/chosen": -219.17808219178082, "logps/rejected": -455.56363636363636, "loss": 0.1945, "rewards/chosen": 1.1806506849315068, "rewards/margins": 8.53519613947696, "rewards/rejected": -7.3545454545454545, "step": 1098 }, { "epoch": 0.7535138841275283, "grad_norm": 0.21620837846251922, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 50192865.88235294, "logits/rejected": 123172727.46666667, "logps/chosen": -184.23529411764707, "logps/rejected": -330.6666666666667, "loss": 0.2072, "rewards/chosen": 1.1066176470588236, "rewards/margins": 7.764950980392157, "rewards/rejected": -6.658333333333333, "step": 1099 }, { "epoch": 0.7541995200548509, "grad_norm": 0.24209473029745662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97307852.8, "logits/rejected": 87718806.06896552, "logps/chosen": -246.4, "logps/rejected": -423.17241379310343, "loss": 0.2022, "rewards/chosen": 1.4160714285714286, "rewards/margins": 9.510899014778325, "rewards/rejected": -8.094827586206897, "step": 1100 }, { "epoch": 0.7548851559821734, "grad_norm": 0.1701129058136448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125295945.76271187, "logits/rejected": 80208465.6231884, "logps/chosen": -266.8474576271187, "logps/rejected": -388.6376811594203, "loss": 0.1568, "rewards/chosen": 0.890625, "rewards/margins": 8.332653985507246, "rewards/rejected": -7.442028985507246, "step": 1101 }, { "epoch": 0.755570791909496, "grad_norm": 0.19196199263927027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139329536.0, "logits/rejected": 79986688.0, "logps/chosen": -204.75, "logps/rejected": -388.5, "loss": 0.1644, "rewards/chosen": 1.5361328125, "rewards/margins": 10.5830078125, "rewards/rejected": -9.046875, "step": 1102 }, { "epoch": 0.7562564278368187, "grad_norm": 0.23113143028271205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96955288.11594203, "logits/rejected": 114063741.83050847, "logps/chosen": -261.1014492753623, "logps/rejected": -410.5762711864407, "loss": 0.1822, "rewards/chosen": 1.1458333333333333, "rewards/margins": 9.594985875706215, "rewards/rejected": -8.44915254237288, "step": 1103 }, { "epoch": 0.7569420637641412, "grad_norm": 0.23890168151350213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140422999.67123288, "logits/rejected": 67947724.8, "logps/chosen": -302.4657534246575, "logps/rejected": -357.8181818181818, "loss": 0.2131, "rewards/chosen": 1.102097602739726, "rewards/margins": 10.393006693648816, "rewards/rejected": -9.290909090909091, "step": 1104 }, { "epoch": 0.7576276996914638, "grad_norm": 0.225895871722442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98855406.34482759, "logits/rejected": 114564417.82857142, "logps/chosen": -199.44827586206895, "logps/rejected": -381.25714285714287, "loss": 0.1597, "rewards/chosen": 1.050646551724138, "rewards/margins": 9.950646551724137, "rewards/rejected": -8.9, "step": 1105 }, { "epoch": 0.7583133356187864, "grad_norm": 0.18045081715586336, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140788804.26666668, "logits/rejected": 85921551.05882353, "logps/chosen": -258.6666666666667, "logps/rejected": -341.88235294117646, "loss": 0.1609, "rewards/chosen": 1.353125, "rewards/margins": 9.551654411764707, "rewards/rejected": -8.198529411764707, "step": 1106 }, { "epoch": 0.7589989715461091, "grad_norm": 0.18471014933884733, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146736112.24615383, "logits/rejected": 72834421.84126984, "logps/chosen": -256.24615384615385, "logps/rejected": -402.2857142857143, "loss": 0.1625, "rewards/chosen": 2.025, "rewards/margins": 10.739285714285714, "rewards/rejected": -8.714285714285714, "step": 1107 }, { "epoch": 0.7596846074734316, "grad_norm": 0.1758103231041167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129913047.57894737, "logits/rejected": 72029105.23076923, "logps/chosen": -228.8421052631579, "logps/rejected": -415.38461538461536, "loss": 0.2013, "rewards/chosen": 1.149671052631579, "rewards/margins": 7.226594129554655, "rewards/rejected": -6.076923076923077, "step": 1108 }, { "epoch": 0.7603702434007542, "grad_norm": 0.17942724668807622, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95464722.02816902, "logits/rejected": 68102251.78947368, "logps/chosen": -215.66197183098592, "logps/rejected": -322.8070175438597, "loss": 0.155, "rewards/chosen": 1.806338028169014, "rewards/margins": 10.762478379046208, "rewards/rejected": -8.956140350877194, "step": 1109 }, { "epoch": 0.7610558793280768, "grad_norm": 0.23888291848909282, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68716680.53333333, "logits/rejected": 80740352.0, "logps/chosen": -194.13333333333333, "logps/rejected": -363.52941176470586, "loss": 0.1418, "rewards/chosen": 1.7875, "rewards/margins": 10.441911764705882, "rewards/rejected": -8.654411764705882, "step": 1110 }, { "epoch": 0.7617415152553993, "grad_norm": 0.18967595257922812, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102170624.0, "logits/rejected": 84148224.0, "logps/chosen": -199.875, "logps/rejected": -401.5, "loss": 0.1605, "rewards/chosen": 1.623046875, "rewards/margins": 10.904296875, "rewards/rejected": -9.28125, "step": 1111 }, { "epoch": 0.762427151182722, "grad_norm": 0.1431578291776948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89491195.34545454, "logits/rejected": 111321424.65753424, "logps/chosen": -184.87272727272727, "logps/rejected": -334.90410958904107, "loss": 0.1555, "rewards/chosen": 1.3, "rewards/margins": 8.163013698630138, "rewards/rejected": -6.863013698630137, "step": 1112 }, { "epoch": 0.7631127871100446, "grad_norm": 0.19554641722926674, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112821109.62162162, "logits/rejected": 83653063.1111111, "logps/chosen": -183.78378378378378, "logps/rejected": -361.48148148148147, "loss": 0.1873, "rewards/chosen": 1.5287162162162162, "rewards/margins": 8.547234734734735, "rewards/rejected": -7.018518518518518, "step": 1113 }, { "epoch": 0.7637984230373671, "grad_norm": 0.18377429657862232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121841093.24590164, "logits/rejected": 66107239.1641791, "logps/chosen": -256.5245901639344, "logps/rejected": -372.05970149253733, "loss": 0.1558, "rewards/chosen": 1.9262295081967213, "rewards/margins": 10.582945926107168, "rewards/rejected": -8.656716417910447, "step": 1114 }, { "epoch": 0.7644840589646897, "grad_norm": 0.18788155545852842, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118163667.86206897, "logits/rejected": 95180741.48571429, "logps/chosen": -229.6551724137931, "logps/rejected": -361.6, "loss": 0.1687, "rewards/chosen": 1.2747844827586208, "rewards/margins": 9.617641625615764, "rewards/rejected": -8.342857142857143, "step": 1115 }, { "epoch": 0.7651696948920124, "grad_norm": 0.16646343722992882, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167497123.67213115, "logits/rejected": 65027362.3880597, "logps/chosen": -289.3114754098361, "logps/rejected": -350.8059701492537, "loss": 0.1511, "rewards/chosen": 1.8514344262295082, "rewards/margins": 10.090240396378762, "rewards/rejected": -8.238805970149254, "step": 1116 }, { "epoch": 0.7658553308193349, "grad_norm": 0.18307597031741443, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100967231.07246377, "logits/rejected": 100076804.33898304, "logps/chosen": -241.97101449275362, "logps/rejected": -360.135593220339, "loss": 0.1728, "rewards/chosen": 1.5706521739130435, "rewards/margins": 9.901160648489315, "rewards/rejected": -8.330508474576272, "step": 1117 }, { "epoch": 0.7665409667466575, "grad_norm": 0.15412868356970175, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69881370.03389831, "logits/rejected": 91058947.71014492, "logps/chosen": -206.10169491525423, "logps/rejected": -372.8695652173913, "loss": 0.1437, "rewards/chosen": 1.6546610169491525, "rewards/margins": 10.372052321296978, "rewards/rejected": -8.717391304347826, "step": 1118 }, { "epoch": 0.7672266026739801, "grad_norm": 0.17709521156899857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136866762.10526314, "logits/rejected": 96232693.18309858, "logps/chosen": -296.280701754386, "logps/rejected": -364.16901408450707, "loss": 0.1662, "rewards/chosen": 1.6018366228070176, "rewards/margins": 8.517329580553497, "rewards/rejected": -6.915492957746479, "step": 1119 }, { "epoch": 0.7679122386013028, "grad_norm": 0.19366585854526466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115201180.20338982, "logits/rejected": 100025032.3478261, "logps/chosen": -268.20338983050846, "logps/rejected": -356.6376811594203, "loss": 0.1648, "rewards/chosen": 1.7854872881355932, "rewards/margins": 9.684038012773275, "rewards/rejected": -7.898550724637682, "step": 1120 }, { "epoch": 0.7685978745286253, "grad_norm": 0.20516599703109092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80704194.20689656, "logits/rejected": 116182220.8, "logps/chosen": -241.6551724137931, "logps/rejected": -382.1714285714286, "loss": 0.1551, "rewards/chosen": 1.7025862068965518, "rewards/margins": 9.70972906403941, "rewards/rejected": -8.007142857142858, "step": 1121 }, { "epoch": 0.7692835104559479, "grad_norm": 0.16635038650479994, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175644545.96923077, "logits/rejected": 32422635.682539683, "logps/chosen": -297.6, "logps/rejected": -357.58730158730157, "loss": 0.1914, "rewards/chosen": 1.1692307692307693, "rewards/margins": 10.383516483516482, "rewards/rejected": -9.214285714285714, "step": 1122 }, { "epoch": 0.7699691463832705, "grad_norm": 0.2081801382779808, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102794827.5409836, "logits/rejected": 59095865.313432835, "logps/chosen": -235.54098360655738, "logps/rejected": -372.53731343283584, "loss": 0.1745, "rewards/chosen": 1.4385245901639345, "rewards/margins": 10.490763396134083, "rewards/rejected": -9.052238805970148, "step": 1123 }, { "epoch": 0.770654782310593, "grad_norm": 0.17035888665166823, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114477868.6984127, "logits/rejected": 101179518.03076923, "logps/chosen": -264.25396825396825, "logps/rejected": -362.83076923076925, "loss": 0.1871, "rewards/chosen": 1.2862103174603174, "rewards/margins": 10.11697954822955, "rewards/rejected": -8.830769230769231, "step": 1124 }, { "epoch": 0.7713404182379157, "grad_norm": 0.20593517307401416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73236480.0, "logits/rejected": 99131392.0, "logps/chosen": -195.0, "logps/rejected": -389.5, "loss": 0.1554, "rewards/chosen": 1.845703125, "rewards/margins": 10.392578125, "rewards/rejected": -8.546875, "step": 1125 }, { "epoch": 0.7720260541652383, "grad_norm": 0.19047494954114436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115823430.93975903, "logits/rejected": 44797496.88888889, "logps/chosen": -199.32530120481928, "logps/rejected": -406.0444444444444, "loss": 0.2205, "rewards/chosen": 1.3885542168674698, "rewards/margins": 7.021887550200804, "rewards/rejected": -5.633333333333334, "step": 1126 }, { "epoch": 0.7727116900925608, "grad_norm": 0.16860452075764237, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61369290.10526316, "logits/rejected": 120512396.61971831, "logps/chosen": -194.94736842105263, "logps/rejected": -385.80281690140845, "loss": 0.1378, "rewards/chosen": 0.8092105263157895, "rewards/margins": 9.499351371386211, "rewards/rejected": -8.690140845070422, "step": 1127 }, { "epoch": 0.7733973260198834, "grad_norm": 0.20343175627708132, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106255701.33333333, "logits/rejected": 87757745.23076923, "logps/chosen": -217.3968253968254, "logps/rejected": -411.5692307692308, "loss": 0.1425, "rewards/chosen": 1.6339285714285714, "rewards/margins": 11.857005494505493, "rewards/rejected": -10.223076923076922, "step": 1128 }, { "epoch": 0.774082961947206, "grad_norm": 0.21797648098745476, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121568239.74603175, "logits/rejected": 79917623.13846155, "logps/chosen": -256.5079365079365, "logps/rejected": -396.8, "loss": 0.1424, "rewards/chosen": 1.8095238095238095, "rewards/margins": 11.024908424908425, "rewards/rejected": -9.215384615384615, "step": 1129 }, { "epoch": 0.7747685978745287, "grad_norm": 0.25908574482041896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131858432.0, "logits/rejected": 113311744.0, "logps/chosen": -243.75, "logps/rejected": -393.5, "loss": 0.2093, "rewards/chosen": 1.251953125, "rewards/margins": 9.150390625, "rewards/rejected": -7.8984375, "step": 1130 }, { "epoch": 0.7754542338018512, "grad_norm": 0.24133873903794423, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131380404.70588236, "logits/rejected": 8860467.2, "logps/chosen": -236.0, "logps/rejected": -322.1333333333333, "loss": 0.1898, "rewards/chosen": 1.228860294117647, "rewards/margins": 9.595526960784314, "rewards/rejected": -8.366666666666667, "step": 1131 }, { "epoch": 0.7761398697291738, "grad_norm": 0.16998147509631337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141255627.93220338, "logits/rejected": 109902922.20289855, "logps/chosen": -263.0508474576271, "logps/rejected": -399.768115942029, "loss": 0.1491, "rewards/chosen": 1.0985169491525424, "rewards/margins": 10.518806804225004, "rewards/rejected": -9.420289855072463, "step": 1132 }, { "epoch": 0.7768255056564964, "grad_norm": 0.19470299423818066, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142961785.4915254, "logits/rejected": 48508037.56521739, "logps/chosen": -183.45762711864407, "logps/rejected": -359.42028985507244, "loss": 0.1432, "rewards/chosen": 1.4798728813559323, "rewards/margins": 10.117554040776222, "rewards/rejected": -8.63768115942029, "step": 1133 }, { "epoch": 0.777511141583819, "grad_norm": 0.20079222752608653, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99661671.1641791, "logits/rejected": 106688310.55737706, "logps/chosen": -206.80597014925374, "logps/rejected": -455.8688524590164, "loss": 0.141, "rewards/chosen": 1.296641791044776, "rewards/margins": 11.09172375825789, "rewards/rejected": -9.795081967213115, "step": 1134 }, { "epoch": 0.7781967775111416, "grad_norm": 0.19121977353193742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130539646.03076923, "logits/rejected": 100197262.22222222, "logps/chosen": -268.0615384615385, "logps/rejected": -344.8888888888889, "loss": 0.1865, "rewards/chosen": 0.8259615384615384, "rewards/margins": 9.103739316239317, "rewards/rejected": -8.277777777777779, "step": 1135 }, { "epoch": 0.7788824134384642, "grad_norm": 0.18412092855321993, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98633794.06451613, "logits/rejected": 110831305.6969697, "logps/chosen": -186.70967741935485, "logps/rejected": -412.6060606060606, "loss": 0.1684, "rewards/chosen": 1.4959677419354838, "rewards/margins": 10.011119257087, "rewards/rejected": -8.515151515151516, "step": 1136 }, { "epoch": 0.7795680493657867, "grad_norm": 0.17556762228632833, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91174542.68852459, "logits/rejected": 121822620.65671642, "logps/chosen": -206.0327868852459, "logps/rejected": -351.5223880597015, "loss": 0.1723, "rewards/chosen": 0.8729508196721312, "rewards/margins": 9.044592610716906, "rewards/rejected": -8.171641791044776, "step": 1137 }, { "epoch": 0.7802536852931093, "grad_norm": 0.16909684029187158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142352135.75757575, "logits/rejected": 47484003.09677419, "logps/chosen": -214.3030303030303, "logps/rejected": -355.0967741935484, "loss": 0.1648, "rewards/chosen": 1.7026515151515151, "rewards/margins": 7.759103128054742, "rewards/rejected": -6.056451612903226, "step": 1138 }, { "epoch": 0.780939321220432, "grad_norm": 0.1758494210795745, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92900703.52238806, "logits/rejected": 92962278.81967214, "logps/chosen": -272.4776119402985, "logps/rejected": -359.60655737704917, "loss": 0.144, "rewards/chosen": 2.16044776119403, "rewards/margins": 8.685037925128455, "rewards/rejected": -6.524590163934426, "step": 1139 }, { "epoch": 0.7816249571477546, "grad_norm": 0.28789282860008947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74804345.49152543, "logits/rejected": 129597914.89855072, "logps/chosen": -208.8135593220339, "logps/rejected": -402.0869565217391, "loss": 0.1512, "rewards/chosen": 1.396186440677966, "rewards/margins": 10.367200933431588, "rewards/rejected": -8.971014492753623, "step": 1140 }, { "epoch": 0.7823105930750771, "grad_norm": 0.1553509515274617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73788681.48148148, "logits/rejected": 141132661.6216216, "logps/chosen": -237.03703703703704, "logps/rejected": -403.8918918918919, "loss": 0.1477, "rewards/chosen": 1.7164351851851851, "rewards/margins": 8771447.338056806, "rewards/rejected": -8771445.621621622, "step": 1141 }, { "epoch": 0.7829962290023997, "grad_norm": 0.20208342206932137, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115474432.0, "logits/rejected": 74366976.0, "logps/chosen": -224.0, "logps/rejected": -426.0, "loss": 0.1549, "rewards/chosen": 1.224609375, "rewards/margins": 11.435546875, "rewards/rejected": -10.2109375, "step": 1142 }, { "epoch": 0.7836818649297224, "grad_norm": 0.20363778433593635, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119431029.15254237, "logits/rejected": 60756620.98550725, "logps/chosen": -260.6101694915254, "logps/rejected": -405.3333333333333, "loss": 0.1693, "rewards/chosen": 1.6271186440677967, "rewards/margins": 10.293785310734464, "rewards/rejected": -8.666666666666666, "step": 1143 }, { "epoch": 0.7843675008570449, "grad_norm": 0.17080419520821963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86549130.15873016, "logits/rejected": 106212682.83076923, "logps/chosen": -195.55555555555554, "logps/rejected": -397.2923076923077, "loss": 0.1546, "rewards/chosen": 1.3869047619047619, "rewards/margins": 10.133058608058608, "rewards/rejected": -8.746153846153845, "step": 1144 }, { "epoch": 0.7850531367843675, "grad_norm": 0.2020969809799504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147266673.7777778, "logits/rejected": 55235757.29230769, "logps/chosen": -229.0793650793651, "logps/rejected": -396.3076923076923, "loss": 0.143, "rewards/chosen": 2.0456349206349205, "rewards/margins": 11.530250305250306, "rewards/rejected": -9.484615384615385, "step": 1145 }, { "epoch": 0.7857387727116901, "grad_norm": 0.14147528009549867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119255957.01492538, "logits/rejected": 61896066.09836066, "logps/chosen": -244.53731343283582, "logps/rejected": -377.1803278688525, "loss": 0.1372, "rewards/chosen": 2.091417910447761, "rewards/margins": 10.484860533398582, "rewards/rejected": -8.39344262295082, "step": 1146 }, { "epoch": 0.7864244086390126, "grad_norm": 0.1805604754872765, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85627465.14285715, "logits/rejected": 89594993.77777778, "logps/chosen": -210.0, "logps/rejected": -350.6666666666667, "loss": 0.1413, "rewards/chosen": 1.7165178571428572, "rewards/margins": 8.202628968253968, "rewards/rejected": -6.486111111111111, "step": 1147 }, { "epoch": 0.7871100445663353, "grad_norm": 0.24581562709904678, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97482615.46666667, "logits/rejected": 99013330.8235294, "logps/chosen": -220.4, "logps/rejected": -340.70588235294116, "loss": 0.1591, "rewards/chosen": 1.515625, "rewards/margins": 9.699448529411764, "rewards/rejected": -8.183823529411764, "step": 1148 }, { "epoch": 0.7877956804936579, "grad_norm": 0.17532340442827382, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94574790.19354838, "logits/rejected": 88842984.72727273, "logps/chosen": -190.70967741935485, "logps/rejected": -341.3333333333333, "loss": 0.1654, "rewards/chosen": 1.2872983870967742, "rewards/margins": 7.219116568914956, "rewards/rejected": -5.931818181818182, "step": 1149 }, { "epoch": 0.7884813164209804, "grad_norm": 0.2464083699518122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 198301192.39344263, "logits/rejected": 65168215.88059702, "logps/chosen": -303.73770491803276, "logps/rejected": -384.0, "loss": 0.171, "rewards/chosen": 1.4702868852459017, "rewards/margins": 9.89566001957426, "rewards/rejected": -8.425373134328359, "step": 1150 }, { "epoch": 0.789166952348303, "grad_norm": 0.17054978088597647, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129499136.0, "logits/rejected": 78544896.0, "logps/chosen": -245.25, "logps/rejected": -396.0, "loss": 0.1843, "rewards/chosen": 1.427734375, "rewards/margins": 8.005859375, "rewards/rejected": -6.578125, "step": 1151 }, { "epoch": 0.7898525882756257, "grad_norm": 0.22918442402833825, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116557500.63157895, "logits/rejected": 64391427.6056338, "logps/chosen": -269.05263157894734, "logps/rejected": -345.6901408450704, "loss": 0.1364, "rewards/chosen": 1.8157894736842106, "rewards/margins": 11.29466271312083, "rewards/rejected": -9.47887323943662, "step": 1152 }, { "epoch": 0.7905382242029483, "grad_norm": 0.29298741022648855, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105708618.20289855, "logits/rejected": 102085093.96610169, "logps/chosen": -201.97101449275362, "logps/rejected": -411.66101694915255, "loss": 0.1776, "rewards/chosen": 1.3876811594202898, "rewards/margins": 10.591070989928763, "rewards/rejected": -9.203389830508474, "step": 1153 }, { "epoch": 0.7912238601302708, "grad_norm": 0.428119569612447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122949448.59701492, "logits/rejected": 68793461.50819673, "logps/chosen": -289.43283582089555, "logps/rejected": -403.9344262295082, "loss": 0.1697, "rewards/chosen": 1.474813432835821, "rewards/margins": 10.83546917054074, "rewards/rejected": -9.360655737704919, "step": 1154 }, { "epoch": 0.7919094960575934, "grad_norm": 0.18511545267672358, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129630208.0, "logits/rejected": 110362624.0, "logps/chosen": -335.5, "logps/rejected": -431.0, "loss": 0.1685, "rewards/chosen": 1.97265625, "rewards/margins": 12.00390625, "rewards/rejected": -10.03125, "step": 1155 }, { "epoch": 0.792595131984916, "grad_norm": 0.1629691181710122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119847079.86885247, "logits/rejected": 71929183.52238806, "logps/chosen": -264.91803278688525, "logps/rejected": -369.1940298507463, "loss": 0.1637, "rewards/chosen": 1.1613729508196722, "rewards/margins": 9.527044592610716, "rewards/rejected": -8.365671641791044, "step": 1156 }, { "epoch": 0.7932807679122386, "grad_norm": 0.18656507079627876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136734310.4, "logits/rejected": 81172118.58823529, "logps/chosen": -309.6, "logps/rejected": -375.52941176470586, "loss": 0.1414, "rewards/chosen": 2.457291666666667, "rewards/margins": 10.596997549019608, "rewards/rejected": -8.139705882352942, "step": 1157 }, { "epoch": 0.7939664038395612, "grad_norm": 0.15771782748234908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133152507.93650794, "logits/rejected": 67366975.01538461, "logps/chosen": -212.57142857142858, "logps/rejected": -385.7230769230769, "loss": 0.1561, "rewards/chosen": 1.3194444444444444, "rewards/margins": 9.550213675213675, "rewards/rejected": -8.23076923076923, "step": 1158 }, { "epoch": 0.7946520397668838, "grad_norm": 0.18041182270731604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148897792.0, "logits/rejected": 77489766.4, "logps/chosen": -266.2068965517241, "logps/rejected": -415.0857142857143, "loss": 0.1528, "rewards/chosen": 1.8857758620689655, "rewards/margins": 10.664347290640395, "rewards/rejected": -8.778571428571428, "step": 1159 }, { "epoch": 0.7953376756942063, "grad_norm": 0.19366934769407204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126774315.26760563, "logits/rejected": 60357506.24561404, "logps/chosen": -270.6478873239437, "logps/rejected": -332.0701754385965, "loss": 0.1894, "rewards/chosen": 1.460387323943662, "rewards/margins": 7.714773288855943, "rewards/rejected": -6.254385964912281, "step": 1160 }, { "epoch": 0.796023311621529, "grad_norm": 0.18039536777135723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120666899.6923077, "logits/rejected": 42941683.809523806, "logps/chosen": -215.3846153846154, "logps/rejected": -380.44444444444446, "loss": 0.1628, "rewards/chosen": 1.751923076923077, "rewards/margins": 10.24398656898657, "rewards/rejected": -8.492063492063492, "step": 1161 }, { "epoch": 0.7967089475488516, "grad_norm": 0.21071153254109745, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115560306.7586207, "logits/rejected": 93053630.17142858, "logps/chosen": -195.58620689655172, "logps/rejected": -391.77142857142854, "loss": 0.1792, "rewards/chosen": 0.7271012931034483, "rewards/margins": 9.519958435960591, "rewards/rejected": -8.792857142857143, "step": 1162 }, { "epoch": 0.7973945834761742, "grad_norm": 0.19557852183997504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109173478.0289855, "logits/rejected": 104786510.10169491, "logps/chosen": -199.42028985507247, "logps/rejected": -379.3898305084746, "loss": 0.1738, "rewards/chosen": 1.3460144927536233, "rewards/margins": 9.473133136821419, "rewards/rejected": -8.127118644067796, "step": 1163 }, { "epoch": 0.7980802194034967, "grad_norm": 0.19712686256858256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83003944.63492064, "logits/rejected": 80530636.8, "logps/chosen": -205.71428571428572, "logps/rejected": -376.12307692307695, "loss": 0.1553, "rewards/chosen": 1.5436507936507937, "rewards/margins": 7.851343101343101, "rewards/rejected": -6.3076923076923075, "step": 1164 }, { "epoch": 0.7987658553308193, "grad_norm": 0.20461614757940608, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93544794.14084508, "logits/rejected": 87970007.57894737, "logps/chosen": -237.74647887323943, "logps/rejected": -371.0877192982456, "loss": 0.1559, "rewards/chosen": 1.8661971830985915, "rewards/margins": 10.120583148010873, "rewards/rejected": -8.25438596491228, "step": 1165 }, { "epoch": 0.799451491258142, "grad_norm": 0.20395981065546717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153354240.0, "logits/rejected": 59834368.0, "logps/chosen": -304.25, "logps/rejected": -372.0, "loss": 0.165, "rewards/chosen": 1.9814453125, "rewards/margins": 10.0751953125, "rewards/rejected": -8.09375, "step": 1166 }, { "epoch": 0.8001371271854645, "grad_norm": 0.18532039497383873, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137663049.14285713, "logits/rejected": 52545308.44444445, "logps/chosen": -286.85714285714283, "logps/rejected": -356.0, "loss": 0.1389, "rewards/chosen": 1.8392857142857142, "rewards/margins": 9.846230158730158, "rewards/rejected": -8.006944444444445, "step": 1167 }, { "epoch": 0.8008227631127871, "grad_norm": 0.17218921056351158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135732337.7777778, "logits/rejected": 98079305.14285715, "logps/chosen": -263.77777777777777, "logps/rejected": -397.14285714285717, "loss": 0.1802, "rewards/chosen": 1.4618055555555556, "rewards/margins": 8.1046626984127, "rewards/rejected": -6.642857142857143, "step": 1168 }, { "epoch": 0.8015083990401097, "grad_norm": 0.1754910050999654, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99345086.17142858, "logits/rejected": 117874405.51724137, "logps/chosen": -243.88571428571427, "logps/rejected": -390.62068965517244, "loss": 0.1678, "rewards/chosen": 1.9464285714285714, "rewards/margins": 10.256773399014778, "rewards/rejected": -8.310344827586206, "step": 1169 }, { "epoch": 0.8021940349674322, "grad_norm": 0.17787739704741218, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139250892.8, "logits/rejected": 110408884.70588236, "logps/chosen": -288.53333333333336, "logps/rejected": -417.88235294117646, "loss": 0.126, "rewards/chosen": 2.310416666666667, "rewards/margins": 10.045710784313727, "rewards/rejected": -7.735294117647059, "step": 1170 }, { "epoch": 0.8028796708947549, "grad_norm": 0.1874877507105764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122383798.85714285, "logits/rejected": 63094784.0, "logps/chosen": -229.71428571428572, "logps/rejected": -370.22222222222223, "loss": 0.1631, "rewards/chosen": 1.4252232142857142, "rewards/margins": 10.876612103174603, "rewards/rejected": -9.45138888888889, "step": 1171 }, { "epoch": 0.8035653068220775, "grad_norm": 0.1922028556363413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110690304.0, "logits/rejected": 98893824.0, "logps/chosen": -252.5, "logps/rejected": -421.0, "loss": 0.173, "rewards/chosen": 0.986328125, "rewards/margins": 10.291015625, "rewards/rejected": -9.3046875, "step": 1172 }, { "epoch": 0.8042509427494001, "grad_norm": 0.21082997447308208, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81149118.91525424, "logits/rejected": 134217728.0, "logps/chosen": -193.35593220338984, "logps/rejected": -416.463768115942, "loss": 0.1752, "rewards/chosen": 1.3919491525423728, "rewards/margins": 8.935427413411938, "rewards/rejected": -7.543478260869565, "step": 1173 }, { "epoch": 0.8049365786767226, "grad_norm": 0.1654842069267559, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89582821.25373134, "logits/rejected": 92756001.5737705, "logps/chosen": -181.8507462686567, "logps/rejected": -392.39344262295083, "loss": 0.1584, "rewards/chosen": 1.748134328358209, "rewards/margins": 8.756331049669685, "rewards/rejected": -7.008196721311475, "step": 1174 }, { "epoch": 0.8056222146040453, "grad_norm": 0.18212382604111427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132052925.93548387, "logits/rejected": 34444132.84848485, "logps/chosen": -259.0967741935484, "logps/rejected": -325.3333333333333, "loss": 0.1548, "rewards/chosen": 1.090725806451613, "rewards/margins": 9.613453079178885, "rewards/rejected": -8.522727272727273, "step": 1175 }, { "epoch": 0.8063078505313679, "grad_norm": 0.16241327573257253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 46187276.190476194, "logits/rejected": 139379948.30769232, "logps/chosen": -175.23809523809524, "logps/rejected": -421.9076923076923, "loss": 0.1819, "rewards/chosen": 0.8462301587301587, "rewards/margins": 8.93853785103785, "rewards/rejected": -8.092307692307692, "step": 1176 }, { "epoch": 0.8069934864586904, "grad_norm": 0.21278452916622553, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 45671310.222222224, "logits/rejected": 125529526.85714285, "logps/chosen": -273.3333333333333, "logps/rejected": -352.85714285714283, "loss": 0.2086, "rewards/chosen": 1.1423611111111112, "rewards/margins": 7.838789682539682, "rewards/rejected": -6.696428571428571, "step": 1177 }, { "epoch": 0.807679122386013, "grad_norm": 0.17638261455645157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86155129.70491803, "logits/rejected": 114060028.17910448, "logps/chosen": -313.7049180327869, "logps/rejected": -363.46268656716416, "loss": 0.1475, "rewards/chosen": 1.8186475409836065, "rewards/margins": 9.333572914117935, "rewards/rejected": -7.514925373134329, "step": 1178 }, { "epoch": 0.8083647583133357, "grad_norm": 0.16633845106741055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112813462.34920634, "logits/rejected": 44911316.676923074, "logps/chosen": -237.20634920634922, "logps/rejected": -296.8615384615385, "loss": 0.1573, "rewards/chosen": 1.5436507936507937, "rewards/margins": 10.05903540903541, "rewards/rejected": -8.515384615384615, "step": 1179 }, { "epoch": 0.8090503942406582, "grad_norm": 0.1352002278036155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93847552.0, "logits/rejected": 79866538.66666667, "logps/chosen": -266.5882352941176, "logps/rejected": -412.8, "loss": 0.1632, "rewards/chosen": 2.1121323529411766, "rewards/margins": 9.128799019607843, "rewards/rejected": -7.016666666666667, "step": 1180 }, { "epoch": 0.8097360301679808, "grad_norm": 0.22230179237669617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132017437.37704918, "logits/rejected": 78314541.85074627, "logps/chosen": -206.4262295081967, "logps/rejected": -359.64179104477614, "loss": 0.1705, "rewards/chosen": 1.3217213114754098, "rewards/margins": 7.590378027893321, "rewards/rejected": -6.268656716417911, "step": 1181 }, { "epoch": 0.8104216660953034, "grad_norm": 0.1946037000395857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143099783.52941176, "logits/rejected": 112756872.53333333, "logps/chosen": -271.52941176470586, "logps/rejected": -388.8, "loss": 0.1731, "rewards/chosen": 1.640625, "rewards/margins": -19119697.559375, "rewards/rejected": 19119699.2, "step": 1182 }, { "epoch": 0.8111073020226259, "grad_norm": 0.17683964290956916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114881275.66101696, "logits/rejected": 85770477.44927536, "logps/chosen": -235.66101694915255, "logps/rejected": -382.1449275362319, "loss": 0.1514, "rewards/chosen": 1.75, "rewards/margins": 10.090579710144928, "rewards/rejected": -8.340579710144928, "step": 1183 }, { "epoch": 0.8117929379499486, "grad_norm": 0.30080332589212266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106916621.96363637, "logits/rejected": 67080135.89041096, "logps/chosen": -226.3272727272727, "logps/rejected": -339.28767123287673, "loss": 0.1599, "rewards/chosen": 1.0704545454545455, "rewards/margins": 9.960865504358654, "rewards/rejected": -8.89041095890411, "step": 1184 }, { "epoch": 0.8124785738772712, "grad_norm": 0.20262072098915412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81450677.67741935, "logits/rejected": 102887548.12121212, "logps/chosen": -215.48387096774192, "logps/rejected": -338.42424242424244, "loss": 0.1718, "rewards/chosen": 1.4939516129032258, "rewards/margins": 10.100012218963832, "rewards/rejected": -8.606060606060606, "step": 1185 }, { "epoch": 0.8131642098045938, "grad_norm": 0.18675892795656907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88489328.64, "logits/rejected": 59499966.35897436, "logps/chosen": -192.0, "logps/rejected": -316.71794871794873, "loss": 0.1537, "rewards/chosen": 1.693125, "rewards/margins": 8.60338141025641, "rewards/rejected": -6.910256410256411, "step": 1186 }, { "epoch": 0.8138498457319163, "grad_norm": 0.1472642293877631, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127754374.29508197, "logits/rejected": 88769001.07462686, "logps/chosen": -258.88524590163934, "logps/rejected": -402.6268656716418, "loss": 0.1665, "rewards/chosen": 1.6331967213114753, "rewards/margins": 10.47648030340103, "rewards/rejected": -8.843283582089553, "step": 1187 }, { "epoch": 0.814535481659239, "grad_norm": 0.1909280387268166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90780467.2, "logits/rejected": 49067188.705882356, "logps/chosen": -190.26666666666668, "logps/rejected": -366.8235294117647, "loss": 0.1497, "rewards/chosen": 1.1645833333333333, "rewards/margins": 10.311642156862744, "rewards/rejected": -9.147058823529411, "step": 1188 }, { "epoch": 0.8152211175865616, "grad_norm": 0.16318896735169652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104071168.0, "logits/rejected": 97255424.0, "logps/chosen": -230.5, "logps/rejected": -371.5, "loss": 0.144, "rewards/chosen": 2.2265625, "rewards/margins": 10.7421875, "rewards/rejected": -8.515625, "step": 1189 }, { "epoch": 0.8159067535138841, "grad_norm": 0.1752099636544609, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92084037.81818181, "logits/rejected": 32235255.741935484, "logps/chosen": -213.0909090909091, "logps/rejected": -331.8709677419355, "loss": 0.1581, "rewards/chosen": 1.2651515151515151, "rewards/margins": 9.878054740957968, "rewards/rejected": -8.612903225806452, "step": 1190 }, { "epoch": 0.8165923894412067, "grad_norm": 0.19374123189929468, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103747343.05882353, "logits/rejected": 57147392.0, "logps/chosen": -234.35294117647058, "logps/rejected": -326.6666666666667, "loss": 0.1665, "rewards/chosen": 1.3547794117647058, "rewards/margins": 9.071446078431373, "rewards/rejected": -7.716666666666667, "step": 1191 }, { "epoch": 0.8172780253685293, "grad_norm": 0.212273549158556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131308775.22580644, "logits/rejected": 101743647.03030303, "logps/chosen": -247.74193548387098, "logps/rejected": -368.4848484848485, "loss": 0.2288, "rewards/chosen": 0.7600806451612904, "rewards/margins": 5.22977761485826, "rewards/rejected": -4.46969696969697, "step": 1192 }, { "epoch": 0.8179636612958519, "grad_norm": 0.2045972876824113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119318558.56716418, "logits/rejected": 101557164.06557377, "logps/chosen": -305.910447761194, "logps/rejected": -410.75409836065575, "loss": 0.1734, "rewards/chosen": 1.6194029850746268, "rewards/margins": 7.652189870320528, "rewards/rejected": -6.032786885245901, "step": 1193 }, { "epoch": 0.8186492972231745, "grad_norm": 0.17809060600602822, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125700064.4923077, "logits/rejected": 36333990.603174604, "logps/chosen": -204.55384615384617, "logps/rejected": -354.031746031746, "loss": 0.1738, "rewards/chosen": 1.623076923076923, "rewards/margins": 9.94053724053724, "rewards/rejected": -8.317460317460318, "step": 1194 }, { "epoch": 0.8193349331504971, "grad_norm": 0.1723859284744899, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127417871.51515152, "logits/rejected": 74381245.93548387, "logps/chosen": -265.6969696969697, "logps/rejected": -328.258064516129, "loss": 0.1703, "rewards/chosen": 1.4678030303030303, "rewards/margins": 9.435544965786901, "rewards/rejected": -7.967741935483871, "step": 1195 }, { "epoch": 0.8200205690778197, "grad_norm": 0.1809377307659798, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127926272.0, "logits/rejected": 52487054.222222224, "logps/chosen": -235.71428571428572, "logps/rejected": -373.55555555555554, "loss": 0.1744, "rewards/chosen": 1.1689453125, "rewards/margins": 7.842556423611111, "rewards/rejected": -6.673611111111111, "step": 1196 }, { "epoch": 0.8207062050051422, "grad_norm": 0.21436967606609086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126507610.35294117, "logits/rejected": 70184686.93333334, "logps/chosen": -256.70588235294116, "logps/rejected": -411.2, "loss": 0.1948, "rewards/chosen": 1.4237132352941178, "rewards/margins": 11.457046568627451, "rewards/rejected": -10.033333333333333, "step": 1197 }, { "epoch": 0.8213918409324649, "grad_norm": 0.19420577993931318, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145631735.60655737, "logits/rejected": 40409301.97014926, "logps/chosen": -290.75409836065575, "logps/rejected": -343.4029850746269, "loss": 0.1372, "rewards/chosen": 2.1598360655737703, "rewards/margins": 10.965806214827502, "rewards/rejected": -8.805970149253731, "step": 1198 }, { "epoch": 0.8220774768597875, "grad_norm": 0.18435466189336705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93368854.26086956, "logits/rejected": 63305554.44067796, "logps/chosen": -259.4782608695652, "logps/rejected": -360.6779661016949, "loss": 0.1902, "rewards/chosen": 1.5842391304347827, "rewards/margins": 9.380849299926307, "rewards/rejected": -7.796610169491525, "step": 1199 }, { "epoch": 0.82276311278711, "grad_norm": 0.21859931139591907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149993319.16417912, "logits/rejected": 57070038.03278688, "logps/chosen": -293.4925373134328, "logps/rejected": -336.0, "loss": 0.1589, "rewards/chosen": 1.7537313432835822, "rewards/margins": 10.040616589185223, "rewards/rejected": -8.28688524590164, "step": 1200 }, { "epoch": 0.8234487487144326, "grad_norm": 0.18757455422288682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82360878.54545455, "logits/rejected": 39152474.838709675, "logps/chosen": -248.0, "logps/rejected": -336.258064516129, "loss": 0.1807, "rewards/chosen": 1.7159090909090908, "rewards/margins": 9.990102639296186, "rewards/rejected": -8.274193548387096, "step": 1201 }, { "epoch": 0.8241343846417553, "grad_norm": 0.21727793149721542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113369569.88235295, "logits/rejected": 60502835.2, "logps/chosen": -203.52941176470588, "logps/rejected": -373.06666666666666, "loss": 0.1848, "rewards/chosen": 1.34375, "rewards/margins": 10.510416666666666, "rewards/rejected": -9.166666666666666, "step": 1202 }, { "epoch": 0.8248200205690778, "grad_norm": 0.2300473765005737, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143684871.31428573, "logits/rejected": 69893014.06896552, "logps/chosen": -249.37142857142857, "logps/rejected": -389.7931034482759, "loss": 0.1816, "rewards/chosen": 1.363392857142857, "rewards/margins": 10.199599753694581, "rewards/rejected": -8.836206896551724, "step": 1203 }, { "epoch": 0.8255056564964004, "grad_norm": 0.19863928283558546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93178076.55384615, "logits/rejected": 111065835.68253969, "logps/chosen": -220.06153846153848, "logps/rejected": -406.3492063492063, "loss": 0.1822, "rewards/chosen": 1.0961538461538463, "rewards/margins": 11.286630036630036, "rewards/rejected": -10.19047619047619, "step": 1204 }, { "epoch": 0.826191292423723, "grad_norm": 0.19477328467799465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118102770.5263158, "logits/rejected": 116875894.15384616, "logps/chosen": -250.52631578947367, "logps/rejected": -381.2307692307692, "loss": 0.1722, "rewards/chosen": 2.0328947368421053, "rewards/margins": 10.66751012145749, "rewards/rejected": -8.634615384615385, "step": 1205 }, { "epoch": 0.8268769283510456, "grad_norm": 0.2787420394102272, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129133723.15151516, "logits/rejected": 62068934.19354839, "logps/chosen": -252.36363636363637, "logps/rejected": -347.35483870967744, "loss": 0.1872, "rewards/chosen": 1.6496212121212122, "rewards/margins": 8.657685728250245, "rewards/rejected": -7.008064516129032, "step": 1206 }, { "epoch": 0.8275625642783682, "grad_norm": 0.26090584280288825, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 171092650.66666666, "logits/rejected": 46538270.11764706, "logps/chosen": -280.26666666666665, "logps/rejected": -338.3529411764706, "loss": 0.162, "rewards/chosen": 1.3979166666666667, "rewards/margins": 9.669975490196078, "rewards/rejected": -8.272058823529411, "step": 1207 }, { "epoch": 0.8282482002056908, "grad_norm": 0.20078389068714517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77220132.57142857, "logits/rejected": 76495075.55555555, "logps/chosen": -229.42857142857142, "logps/rejected": -387.55555555555554, "loss": 0.1522, "rewards/chosen": 1.8872767857142858, "rewards/margins": 9.894221230158731, "rewards/rejected": -8.006944444444445, "step": 1208 }, { "epoch": 0.8289338361330134, "grad_norm": 0.21152269487801983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105793828.57142857, "logits/rejected": 132382720.0, "logps/chosen": -268.57142857142856, "logps/rejected": -413.77777777777777, "loss": 0.1412, "rewards/chosen": 1.4732142857142858, "rewards/margins": 9.750992063492065, "rewards/rejected": -8.277777777777779, "step": 1209 }, { "epoch": 0.8296194720603359, "grad_norm": 0.16553802687564761, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170428552.53333333, "logits/rejected": 57178232.47058824, "logps/chosen": -192.53333333333333, "logps/rejected": -438.11764705882354, "loss": 0.1585, "rewards/chosen": 1.0770833333333334, "rewards/margins": 10.805024509803921, "rewards/rejected": -9.727941176470589, "step": 1210 }, { "epoch": 0.8303051079876586, "grad_norm": 0.18804502285149896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99447555.71014492, "logits/rejected": 114312556.47457626, "logps/chosen": -192.69565217391303, "logps/rejected": -392.6779661016949, "loss": 0.17, "rewards/chosen": 1.2554347826086956, "rewards/margins": 5.865604274134119, "rewards/rejected": -4.610169491525424, "step": 1211 }, { "epoch": 0.8309907439149812, "grad_norm": 0.29836046024419366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130722474.66666667, "logits/rejected": 86846765.1764706, "logps/chosen": -301.6, "logps/rejected": -383.52941176470586, "loss": 0.1419, "rewards/chosen": 1.3770833333333334, "rewards/margins": 9.957965686274509, "rewards/rejected": -8.580882352941176, "step": 1212 }, { "epoch": 0.8316763798423037, "grad_norm": 0.17048209936632241, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92525094.20895523, "logits/rejected": 109258181.24590164, "logps/chosen": -183.88059701492537, "logps/rejected": -402.3606557377049, "loss": 0.1415, "rewards/chosen": 1.8451492537313432, "rewards/margins": 8.222198434059212, "rewards/rejected": -6.377049180327869, "step": 1213 }, { "epoch": 0.8323620157696263, "grad_norm": 0.18109932083195696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139168381.90163934, "logits/rejected": 63916184.8358209, "logps/chosen": -266.62295081967216, "logps/rejected": -351.2835820895522, "loss": 0.1736, "rewards/chosen": 0.9405737704918032, "rewards/margins": 8.99281257646195, "rewards/rejected": -8.052238805970148, "step": 1214 }, { "epoch": 0.8330476516969489, "grad_norm": 0.17805911080659778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87516970.02985075, "logits/rejected": 77188515.67213115, "logps/chosen": -218.50746268656715, "logps/rejected": -377.1803278688525, "loss": 0.1504, "rewards/chosen": 1.6977611940298507, "rewards/margins": 9.87808906288231, "rewards/rejected": -8.180327868852459, "step": 1215 }, { "epoch": 0.8337332876242715, "grad_norm": 0.1938856787304735, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124326682.74626866, "logits/rejected": 43627637.50819672, "logps/chosen": -234.98507462686567, "logps/rejected": -420.72131147540983, "loss": 0.1689, "rewards/chosen": 1.757462686567164, "rewards/margins": 10.58533153902618, "rewards/rejected": -8.827868852459016, "step": 1216 }, { "epoch": 0.8344189235515941, "grad_norm": 0.24025919145473182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114204906.05714285, "logits/rejected": 79257882.48275863, "logps/chosen": -245.25714285714287, "logps/rejected": -425.37931034482756, "loss": 0.1701, "rewards/chosen": 1.9446428571428571, "rewards/margins": 8.461884236453201, "rewards/rejected": -6.517241379310345, "step": 1217 }, { "epoch": 0.8351045594789167, "grad_norm": 0.16090008368648465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81669090.74285714, "logits/rejected": 94154893.2413793, "logps/chosen": -271.54285714285714, "logps/rejected": -312.0, "loss": 0.1743, "rewards/chosen": 2.055357142857143, "rewards/margins": 9.063977832512315, "rewards/rejected": -7.008620689655173, "step": 1218 }, { "epoch": 0.8357901954062393, "grad_norm": 0.16523826997099636, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126288769.75342466, "logits/rejected": 70921867.63636364, "logps/chosen": -290.1917808219178, "logps/rejected": -411.92727272727274, "loss": 0.2075, "rewards/chosen": 1.7705479452054795, "rewards/margins": 11.543275217932752, "rewards/rejected": -9.772727272727273, "step": 1219 }, { "epoch": 0.8364758313335618, "grad_norm": 0.16136778395241527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138179015.1111111, "logits/rejected": 46080664.216216214, "logps/chosen": -220.74074074074073, "logps/rejected": -304.86486486486484, "loss": 0.1357, "rewards/chosen": 1.8055555555555556, "rewards/margins": 10.832582582582582, "rewards/rejected": -9.027027027027026, "step": 1220 }, { "epoch": 0.8371614672608845, "grad_norm": 0.19227783243144675, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 41402368.0, "logits/rejected": 92143616.0, "logps/chosen": -179.5, "logps/rejected": -435.0, "loss": 0.1706, "rewards/chosen": 0.9638671875, "rewards/margins": 11.0263671875, "rewards/rejected": -10.0625, "step": 1221 }, { "epoch": 0.8378471031882071, "grad_norm": 0.17591038561098066, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82138453.33333333, "logits/rejected": 110999259.42857143, "logps/chosen": -244.0, "logps/rejected": -408.0, "loss": 0.1801, "rewards/chosen": 1.4670138888888888, "rewards/margins": 10.315228174603176, "rewards/rejected": -8.848214285714286, "step": 1222 }, { "epoch": 0.8385327391155296, "grad_norm": 0.2526172971268347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115343360.0, "logits/rejected": 68943872.0, "logps/chosen": -249.5, "logps/rejected": -347.5, "loss": 0.1469, "rewards/chosen": 2.009765625, "rewards/margins": 11.712890625, "rewards/rejected": -9.703125, "step": 1223 }, { "epoch": 0.8392183750428522, "grad_norm": 0.2075747039808128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116472595.6923077, "logits/rejected": 69239304.12698413, "logps/chosen": -287.0153846153846, "logps/rejected": -381.968253968254, "loss": 0.148, "rewards/chosen": 2.373076923076923, "rewards/margins": 10.142918192918193, "rewards/rejected": -7.76984126984127, "step": 1224 }, { "epoch": 0.8399040109701749, "grad_norm": 0.16744181744950035, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128779350.77966101, "logits/rejected": 35803551.53623188, "logps/chosen": -231.05084745762713, "logps/rejected": -354.7826086956522, "loss": 0.1507, "rewards/chosen": 1.3347457627118644, "rewards/margins": 10.537644313436502, "rewards/rejected": -9.202898550724637, "step": 1225 }, { "epoch": 0.8405896468974974, "grad_norm": 0.17439281217214544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77260601.80645162, "logits/rejected": 77078279.75757575, "logps/chosen": -197.93548387096774, "logps/rejected": -374.3030303030303, "loss": 0.144, "rewards/chosen": 1.1491935483870968, "rewards/margins": 9.906769305962854, "rewards/rejected": -8.757575757575758, "step": 1226 }, { "epoch": 0.84127528282482, "grad_norm": 0.18083164248269393, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130236693.69491525, "logits/rejected": 66774535.42028985, "logps/chosen": -231.05084745762713, "logps/rejected": -401.15942028985506, "loss": 0.1584, "rewards/chosen": 1.6398305084745763, "rewards/margins": 10.900700073691967, "rewards/rejected": -9.26086956521739, "step": 1227 }, { "epoch": 0.8419609187521426, "grad_norm": 0.18183441590940652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132567510.03278689, "logits/rejected": 55590178.3880597, "logps/chosen": -258.3606557377049, "logps/rejected": -428.4179104477612, "loss": 0.1828, "rewards/chosen": 1.283811475409836, "rewards/margins": 9.447990579887447, "rewards/rejected": -8.164179104477611, "step": 1228 }, { "epoch": 0.8426465546794653, "grad_norm": 0.1837678946146617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89353654.85714285, "logits/rejected": 61982492.44444445, "logps/chosen": -169.85714285714286, "logps/rejected": -336.0, "loss": 0.1505, "rewards/chosen": 1.1456473214285714, "rewards/margins": 10.42342509920635, "rewards/rejected": -9.277777777777779, "step": 1229 }, { "epoch": 0.8433321906067878, "grad_norm": 0.21447623110886335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144834560.0, "logits/rejected": 58228736.0, "logps/chosen": -258.0, "logps/rejected": -370.75, "loss": 0.1871, "rewards/chosen": 1.490234375, "rewards/margins": 10.912109375, "rewards/rejected": -9.421875, "step": 1230 }, { "epoch": 0.8440178265341104, "grad_norm": 0.2009016937311084, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124022342.8923077, "logits/rejected": 74698556.95238096, "logps/chosen": -261.16923076923075, "logps/rejected": -349.968253968254, "loss": 0.1579, "rewards/chosen": 1.4019230769230768, "rewards/margins": 10.481288156288155, "rewards/rejected": -9.079365079365079, "step": 1231 }, { "epoch": 0.844703462461433, "grad_norm": 0.1657150766442941, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150016273.06666666, "logits/rejected": 94371840.0, "logps/chosen": -261.6, "logps/rejected": -437.1764705882353, "loss": 0.1563, "rewards/chosen": 1.7875, "rewards/margins": 10.16985294117647, "rewards/rejected": -8.382352941176471, "step": 1232 }, { "epoch": 0.8453890983887555, "grad_norm": 0.21438433814932428, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151134754.13333333, "logits/rejected": 36885202.823529415, "logps/chosen": -295.2, "logps/rejected": -345.88235294117646, "loss": 0.1577, "rewards/chosen": 2.0072916666666667, "rewards/margins": 11.110232843137256, "rewards/rejected": -9.102941176470589, "step": 1233 }, { "epoch": 0.8460747343160782, "grad_norm": 0.1824659929094045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89578349.71428572, "logits/rejected": 81391192.27586207, "logps/chosen": -179.42857142857142, "logps/rejected": -454.0689655172414, "loss": 0.1857, "rewards/chosen": 1.4348214285714285, "rewards/margins": 10.934821428571428, "rewards/rejected": -9.5, "step": 1234 }, { "epoch": 0.8467603702434008, "grad_norm": 0.2032892838961469, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147486980.65454546, "logits/rejected": 81013269.0410959, "logps/chosen": -304.8727272727273, "logps/rejected": -336.6575342465753, "loss": 0.1312, "rewards/chosen": 2.172727272727273, "rewards/margins": 8.282316313823163, "rewards/rejected": -6.109589041095891, "step": 1235 }, { "epoch": 0.8474460061707233, "grad_norm": 0.1596502559568051, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70512438.55737706, "logits/rejected": 103417764.29850747, "logps/chosen": -188.85245901639345, "logps/rejected": -353.910447761194, "loss": 0.1321, "rewards/chosen": 1.1065573770491803, "rewards/margins": 9.449840959138733, "rewards/rejected": -8.343283582089553, "step": 1236 }, { "epoch": 0.8481316420980459, "grad_norm": 0.19360719845439753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139000706.24561402, "logits/rejected": 75379322.59154929, "logps/chosen": -272.56140350877195, "logps/rejected": -412.84507042253523, "loss": 0.1565, "rewards/chosen": 1.9473684210526316, "rewards/margins": 9.074128984432914, "rewards/rejected": -7.126760563380282, "step": 1237 }, { "epoch": 0.8488172780253685, "grad_norm": 0.16791499305270347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115152709.81818181, "logits/rejected": 91470300.93150684, "logps/chosen": -239.70909090909092, "logps/rejected": -424.7671232876712, "loss": 0.1447, "rewards/chosen": 1.0767045454545454, "rewards/margins": 9.775334682440846, "rewards/rejected": -8.698630136986301, "step": 1238 }, { "epoch": 0.8495029139526912, "grad_norm": 0.18702747329121877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86486548.48, "logits/rejected": 120559353.43589744, "logps/chosen": -195.84, "logps/rejected": -396.3076923076923, "loss": 0.1276, "rewards/chosen": 1.805, "rewards/margins": 8.702435897435898, "rewards/rejected": -6.897435897435898, "step": 1239 }, { "epoch": 0.8501885498800137, "grad_norm": 0.1749592245813485, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103921947.56923077, "logits/rejected": 99198618.41269842, "logps/chosen": -304.0, "logps/rejected": -377.6507936507937, "loss": 0.1641, "rewards/chosen": 1.7903846153846155, "rewards/margins": 10.782448107448108, "rewards/rejected": -8.992063492063492, "step": 1240 }, { "epoch": 0.8508741858073363, "grad_norm": 0.2354579661004872, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123911723.88571429, "logits/rejected": 29992889.379310343, "logps/chosen": -216.9142857142857, "logps/rejected": -336.0, "loss": 0.1851, "rewards/chosen": 1.0366071428571428, "rewards/margins": 7.734883004926108, "rewards/rejected": -6.698275862068965, "step": 1241 }, { "epoch": 0.8515598217346589, "grad_norm": 0.20136619018210047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76416771.50684932, "logits/rejected": 83275999.41818182, "logps/chosen": -218.08219178082192, "logps/rejected": -367.7090909090909, "loss": 0.1848, "rewards/chosen": 1.6164383561643836, "rewards/margins": 8.507347447073474, "rewards/rejected": -6.890909090909091, "step": 1242 }, { "epoch": 0.8522454576619815, "grad_norm": 0.17664183955623042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113046479.23809524, "logits/rejected": 93178076.55384615, "logps/chosen": -265.9047619047619, "logps/rejected": -363.81538461538463, "loss": 0.1641, "rewards/chosen": 1.7261904761904763, "rewards/margins": 9.403113553113553, "rewards/rejected": -7.676923076923077, "step": 1243 }, { "epoch": 0.8529310935893041, "grad_norm": 0.20937040606715765, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116493411.09677419, "logits/rejected": 100663296.0, "logps/chosen": -280.7741935483871, "logps/rejected": -437.3333333333333, "loss": 0.1778, "rewards/chosen": 1.6754032258064515, "rewards/margins": 9.455706256109481, "rewards/rejected": -7.78030303030303, "step": 1244 }, { "epoch": 0.8536167295166267, "grad_norm": 0.19071365217396988, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119635968.0, "logits/rejected": 96600064.0, "logps/chosen": -249.25, "logps/rejected": -444.0, "loss": 0.1804, "rewards/chosen": 1.548828125, "rewards/margins": 10.861328125, "rewards/rejected": -9.3125, "step": 1245 }, { "epoch": 0.8543023654439492, "grad_norm": 0.18777457524124883, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79575267.55555555, "logits/rejected": 121485019.42857143, "logps/chosen": -195.33333333333334, "logps/rejected": -401.14285714285717, "loss": 0.1522, "rewards/chosen": 2.046875, "rewards/margins": 8.171875, "rewards/rejected": -6.125, "step": 1246 }, { "epoch": 0.8549880013712718, "grad_norm": 0.16877730755211426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148424241.5483871, "logits/rejected": 68888265.6969697, "logps/chosen": -237.93548387096774, "logps/rejected": -392.72727272727275, "loss": 0.1646, "rewards/chosen": 1.9274193548387097, "rewards/margins": 8.950146627565982, "rewards/rejected": -7.0227272727272725, "step": 1247 }, { "epoch": 0.8556736372985945, "grad_norm": 0.2135985688852188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 39769627.92727273, "logits/rejected": 164209874.41095892, "logps/chosen": -218.47272727272727, "logps/rejected": -396.2739726027397, "loss": 0.1356, "rewards/chosen": 1.1147727272727272, "rewards/margins": 10.88189601494396, "rewards/rejected": -9.767123287671232, "step": 1248 }, { "epoch": 0.856359273225917, "grad_norm": 0.19374994791788397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 188373594.3529412, "logits/rejected": 40789606.4, "logps/chosen": -326.5882352941176, "logps/rejected": -340.0, "loss": 0.18, "rewards/chosen": 1.8483455882352942, "rewards/margins": 10.456678921568626, "rewards/rejected": -8.608333333333333, "step": 1249 }, { "epoch": 0.8570449091532396, "grad_norm": 0.1711809743293962, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136251329.93939394, "logits/rejected": 53189863.22580645, "logps/chosen": -188.36363636363637, "logps/rejected": -310.96774193548384, "loss": 0.1558, "rewards/chosen": 1.7159090909090908, "rewards/margins": 9.53042521994135, "rewards/rejected": -7.814516129032258, "step": 1250 }, { "epoch": 0.8577305450805622, "grad_norm": 0.19899396764015828, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103702646.72463769, "logits/rejected": 101516374.77966101, "logps/chosen": -211.94202898550725, "logps/rejected": -429.0169491525424, "loss": 0.1509, "rewards/chosen": 1.6340579710144927, "rewards/margins": 10.828973225251781, "rewards/rejected": -9.194915254237289, "step": 1251 }, { "epoch": 0.8584161810078849, "grad_norm": 0.16920477355621824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95029156.29850747, "logits/rejected": 116477884.85245901, "logps/chosen": -210.38805970149255, "logps/rejected": -380.8524590163934, "loss": 0.1508, "rewards/chosen": 1.8507462686567164, "rewards/margins": 10.727795448984587, "rewards/rejected": -8.87704918032787, "step": 1252 }, { "epoch": 0.8591018169352074, "grad_norm": 0.1774285392462998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114936449.91044776, "logits/rejected": 83886080.0, "logps/chosen": -225.91044776119404, "logps/rejected": -392.39344262295083, "loss": 0.1449, "rewards/chosen": 2.281716417910448, "rewards/margins": 11.494831172008809, "rewards/rejected": -9.21311475409836, "step": 1253 }, { "epoch": 0.85978745286253, "grad_norm": 0.19145304901856966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132513792.0, "logits/rejected": 117702656.0, "logps/chosen": -265.5, "logps/rejected": -481.0, "loss": 0.1708, "rewards/chosen": 1.5751953125, "rewards/margins": 10.3017578125, "rewards/rejected": -8.7265625, "step": 1254 }, { "epoch": 0.8604730887898526, "grad_norm": 0.21849672104947312, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120194980.29850747, "logits/rejected": 85398779.80327868, "logps/chosen": -225.43283582089552, "logps/rejected": -396.0655737704918, "loss": 0.1987, "rewards/chosen": 1.287313432835821, "rewards/margins": 8.680756055786642, "rewards/rejected": -7.39344262295082, "step": 1255 }, { "epoch": 0.8611587247171751, "grad_norm": 0.20961511442772385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129055507.6923077, "logits/rejected": 67329616.84210527, "logps/chosen": -219.3846153846154, "logps/rejected": -350.7368421052632, "loss": 0.136, "rewards/chosen": 1.5600961538461537, "rewards/margins": 7.389043522267206, "rewards/rejected": -5.828947368421052, "step": 1256 }, { "epoch": 0.8618443606444978, "grad_norm": 0.1555882536713128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158613169.63265306, "logits/rejected": 58587524.8607595, "logps/chosen": -153.55102040816325, "logps/rejected": -418.8354430379747, "loss": 0.1345, "rewards/chosen": 1.3246173469387754, "rewards/margins": 11.387908486179281, "rewards/rejected": -10.063291139240507, "step": 1257 }, { "epoch": 0.8625299965718204, "grad_norm": 0.17329158281160956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106364928.0, "logits/rejected": 107085824.0, "logps/chosen": -214.0, "logps/rejected": -437.0, "loss": 0.1564, "rewards/chosen": 2.080078125, "rewards/margins": 10.908203125, "rewards/rejected": -8.828125, "step": 1258 }, { "epoch": 0.8632156324991429, "grad_norm": 0.17193614268503984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144284057.6, "logits/rejected": 68717638.1369863, "logps/chosen": -252.8, "logps/rejected": -368.6575342465753, "loss": 0.1563, "rewards/chosen": 1.8494318181818181, "rewards/margins": 10.363130448318804, "rewards/rejected": -8.513698630136986, "step": 1259 }, { "epoch": 0.8639012684264655, "grad_norm": 0.20512344569292426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90095616.0, "logits/rejected": 90963968.0, "logps/chosen": -220.25, "logps/rejected": -400.0, "loss": 0.1453, "rewards/chosen": 1.796875, "rewards/margins": 11.3359375, "rewards/rejected": -9.5390625, "step": 1260 }, { "epoch": 0.8645869043537882, "grad_norm": 0.18374034181658153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74757300.70588236, "logits/rejected": 113595733.33333333, "logps/chosen": -196.47058823529412, "logps/rejected": -354.93333333333334, "loss": 0.2004, "rewards/chosen": 0.7610294117647058, "rewards/margins": 10.327696078431373, "rewards/rejected": -9.566666666666666, "step": 1261 }, { "epoch": 0.8652725402811108, "grad_norm": 0.22260005688620144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90865664.0, "logits/rejected": 117571584.0, "logps/chosen": -239.5, "logps/rejected": -387.0, "loss": 0.1671, "rewards/chosen": 1.67578125, "rewards/margins": 8.08203125, "rewards/rejected": -6.40625, "step": 1262 }, { "epoch": 0.8659581762084333, "grad_norm": 0.24575606802990538, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128498222.54545455, "logits/rejected": 66500013.41935484, "logps/chosen": -216.0, "logps/rejected": -403.35483870967744, "loss": 0.1551, "rewards/chosen": 1.7708333333333333, "rewards/margins": 10.924059139784946, "rewards/rejected": -9.153225806451612, "step": 1263 }, { "epoch": 0.8666438121357559, "grad_norm": 0.1995041614353055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136124229.8181818, "logits/rejected": 93965939.61290322, "logps/chosen": -254.54545454545453, "logps/rejected": -382.96774193548384, "loss": 0.1795, "rewards/chosen": 2.058712121212121, "rewards/margins": 9.921615347018573, "rewards/rejected": -7.862903225806452, "step": 1264 }, { "epoch": 0.8673294480630785, "grad_norm": 0.2160339465627041, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128207978.98507462, "logits/rejected": 30529032.393442623, "logps/chosen": -228.29850746268656, "logps/rejected": -379.8032786885246, "loss": 0.1422, "rewards/chosen": 2.1305970149253732, "rewards/margins": 11.745351113286029, "rewards/rejected": -9.614754098360656, "step": 1265 }, { "epoch": 0.8680150839904011, "grad_norm": 0.19186805294427334, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152246975.0447761, "logits/rejected": 16570938.754098361, "logps/chosen": -277.4925373134328, "logps/rejected": -397.6393442622951, "loss": 0.205, "rewards/chosen": 0.9682835820895522, "rewards/margins": 9.378119647663324, "rewards/rejected": -8.40983606557377, "step": 1266 }, { "epoch": 0.8687007199177237, "grad_norm": 0.19777357096703124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142606336.0, "logits/rejected": 69510103.04, "logps/chosen": -259.8974358974359, "logps/rejected": -384.64, "loss": 0.2101, "rewards/chosen": 1.001602564102564, "rewards/margins": 11.721602564102565, "rewards/rejected": -10.72, "step": 1267 }, { "epoch": 0.8693863558450463, "grad_norm": 0.23455413285072607, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79871755.46268657, "logits/rejected": 93237315.14754099, "logps/chosen": -191.044776119403, "logps/rejected": -377.1803278688525, "loss": 0.1587, "rewards/chosen": 1.4496268656716418, "rewards/margins": 8.015200636163446, "rewards/rejected": -6.565573770491803, "step": 1268 }, { "epoch": 0.8700719917723688, "grad_norm": 0.1777594509962022, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147965724.44444445, "logits/rejected": 58870052.571428575, "logps/chosen": -244.88888888888889, "logps/rejected": -433.14285714285717, "loss": 0.1755, "rewards/chosen": 1.53125, "rewards/margins": 11.013392857142858, "rewards/rejected": -9.482142857142858, "step": 1269 }, { "epoch": 0.8707576276996914, "grad_norm": 0.22160255818335006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123909692.7457627, "logits/rejected": 103459498.66666667, "logps/chosen": -250.84745762711864, "logps/rejected": -415.07246376811594, "loss": 0.1251, "rewards/chosen": 2.5402542372881354, "rewards/margins": 11.648949889462047, "rewards/rejected": -9.108695652173912, "step": 1270 }, { "epoch": 0.8714432636270141, "grad_norm": 0.16256884236349853, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119969430.58823529, "logits/rejected": 71967266.13333334, "logps/chosen": -192.8235294117647, "logps/rejected": -401.6, "loss": 0.1596, "rewards/chosen": 1.8125, "rewards/margins": 10.4875, "rewards/rejected": -8.675, "step": 1271 }, { "epoch": 0.8721288995543367, "grad_norm": 0.15784294767564777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93037288.72727273, "logits/rejected": 133405927.22580644, "logps/chosen": -277.3333333333333, "logps/rejected": -397.93548387096774, "loss": 0.1311, "rewards/chosen": 1.9517045454545454, "rewards/margins": 12.209769061583577, "rewards/rejected": -10.258064516129032, "step": 1272 }, { "epoch": 0.8728145354816592, "grad_norm": 0.16593030545350992, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96706405.43396227, "logits/rejected": 77454813.86666666, "logps/chosen": -208.9056603773585, "logps/rejected": -392.1066666666667, "loss": 0.1382, "rewards/chosen": 1.5106132075471699, "rewards/margins": 8.577279874213836, "rewards/rejected": -7.066666666666666, "step": 1273 }, { "epoch": 0.8735001714089818, "grad_norm": 0.19090793184413937, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128974848.0, "logits/rejected": 45744128.0, "logps/chosen": -210.75, "logps/rejected": -338.5, "loss": 0.1529, "rewards/chosen": 1.5986328125, "rewards/margins": 9.7548828125, "rewards/rejected": -8.15625, "step": 1274 }, { "epoch": 0.8741858073363045, "grad_norm": 0.18671977916906352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168734787.147541, "logits/rejected": 30424354.3880597, "logps/chosen": -306.3606557377049, "logps/rejected": -359.1641791044776, "loss": 0.1929, "rewards/chosen": 1.5122950819672132, "rewards/margins": 10.32572791778811, "rewards/rejected": -8.813432835820896, "step": 1275 }, { "epoch": 0.874871443263627, "grad_norm": 0.2227463657613722, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128129222.19354838, "logits/rejected": 59895932.121212125, "logps/chosen": -219.3548387096774, "logps/rejected": -416.4848484848485, "loss": 0.1585, "rewards/chosen": 1.8588709677419355, "rewards/margins": 12.161901270772239, "rewards/rejected": -10.303030303030303, "step": 1276 }, { "epoch": 0.8755570791909496, "grad_norm": 0.17018084441256906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95579291.15151516, "logits/rejected": 69745630.96774194, "logps/chosen": -186.9090909090909, "logps/rejected": -347.35483870967744, "loss": 0.1721, "rewards/chosen": 1.4223484848484849, "rewards/margins": 8.922348484848484, "rewards/rejected": -7.5, "step": 1277 }, { "epoch": 0.8762427151182722, "grad_norm": 0.1652980509279481, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77664529.06666666, "logits/rejected": 114973274.35294117, "logps/chosen": -211.33333333333334, "logps/rejected": -377.4117647058824, "loss": 0.1588, "rewards/chosen": 1.421875, "rewards/margins": 9.826286764705882, "rewards/rejected": -8.404411764705882, "step": 1278 }, { "epoch": 0.8769283510455947, "grad_norm": 0.18211321053106788, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115909258.15873016, "logits/rejected": 90403383.13846155, "logps/chosen": -228.06349206349208, "logps/rejected": -351.5076923076923, "loss": 0.163, "rewards/chosen": 1.9821428571428572, "rewards/margins": 10.082142857142857, "rewards/rejected": -8.1, "step": 1279 }, { "epoch": 0.8776139869729174, "grad_norm": 0.1729480161006309, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93291488.96969697, "logits/rejected": 99648545.03225806, "logps/chosen": -251.3939393939394, "logps/rejected": -386.06451612903226, "loss": 0.1578, "rewards/chosen": 1.5738636363636365, "rewards/margins": -13242709.135813782, "rewards/rejected": 13242710.709677419, "step": 1280 }, { "epoch": 0.87829962290024, "grad_norm": 0.17383261670880687, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116899311.48387097, "logits/rejected": 94562490.18181819, "logps/chosen": -236.1290322580645, "logps/rejected": -400.969696969697, "loss": 0.1544, "rewards/chosen": 1.341733870967742, "rewards/margins": 9.175067204301076, "rewards/rejected": -7.833333333333333, "step": 1281 }, { "epoch": 0.8789852588275625, "grad_norm": 0.16032670045106134, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124125184.0, "logits/rejected": 99090432.0, "logps/chosen": -269.75, "logps/rejected": -423.5, "loss": 0.1542, "rewards/chosen": 2.17333984375, "rewards/margins": 8.75927734375, "rewards/rejected": -6.5859375, "step": 1282 }, { "epoch": 0.8796708947548851, "grad_norm": 0.18200325863518338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157045743.21311477, "logits/rejected": 25291027.10447761, "logps/chosen": -187.40983606557376, "logps/rejected": -347.7014925373134, "loss": 0.1762, "rewards/chosen": 1.5727459016393444, "rewards/margins": 9.453342916564717, "rewards/rejected": -7.880597014925373, "step": 1283 }, { "epoch": 0.8803565306822078, "grad_norm": 0.1717210940927595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103459498.66666667, "logits/rejected": 147003590.19354838, "logps/chosen": -231.03030303030303, "logps/rejected": -430.4516129032258, "loss": 0.188, "rewards/chosen": 1.206439393939394, "rewards/margins": 9.44031036168133, "rewards/rejected": -8.233870967741936, "step": 1284 }, { "epoch": 0.8810421666095304, "grad_norm": 0.19568757656112826, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122993534.1971831, "logits/rejected": 43525102.03508772, "logps/chosen": -234.14084507042253, "logps/rejected": -396.63157894736844, "loss": 0.1823, "rewards/chosen": 1.3838028169014085, "rewards/margins": 8.287311588831233, "rewards/rejected": -6.9035087719298245, "step": 1285 }, { "epoch": 0.8817278025368529, "grad_norm": 0.20920990101897816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79517013.33333333, "logits/rejected": 81048756.70588236, "logps/chosen": -258.4, "logps/rejected": -405.6470588235294, "loss": 0.1712, "rewards/chosen": 1.3447916666666666, "rewards/margins": 10.241850490196079, "rewards/rejected": -8.897058823529411, "step": 1286 }, { "epoch": 0.8824134384641755, "grad_norm": 0.17889238043498684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74598692.57142857, "logits/rejected": 107212863.01538461, "logps/chosen": -211.04761904761904, "logps/rejected": -401.7230769230769, "loss": 0.1702, "rewards/chosen": 1.5912698412698412, "rewards/margins": 9.89896214896215, "rewards/rejected": -8.307692307692308, "step": 1287 }, { "epoch": 0.8830990743914982, "grad_norm": 0.21694676210549554, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120825388.91228071, "logits/rejected": 96409917.29577465, "logps/chosen": -247.2982456140351, "logps/rejected": -395.71830985915494, "loss": 0.1617, "rewards/chosen": 1.105263157894737, "rewards/margins": 10.689770200148258, "rewards/rejected": -9.584507042253522, "step": 1288 }, { "epoch": 0.8837847103188207, "grad_norm": 0.1739174900612876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137268130.9090909, "logits/rejected": 33452956.903225806, "logps/chosen": -264.969696969697, "logps/rejected": -320.51612903225805, "loss": 0.1424, "rewards/chosen": 2.206439393939394, "rewards/margins": 10.472568426197459, "rewards/rejected": -8.266129032258064, "step": 1289 }, { "epoch": 0.8844703462461433, "grad_norm": 0.16607038441489747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 38023772.32786885, "logits/rejected": 128458385.19402985, "logps/chosen": -174.81967213114754, "logps/rejected": -463.76119402985074, "loss": 0.1706, "rewards/chosen": 1.3176229508196722, "rewards/margins": 10.56389160753609, "rewards/rejected": -9.246268656716419, "step": 1290 }, { "epoch": 0.8851559821734659, "grad_norm": 0.22114553185630875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116423711.03030303, "logits/rejected": 102016297.29032259, "logps/chosen": -255.03030303030303, "logps/rejected": -414.96774193548384, "loss": 0.1808, "rewards/chosen": 1.5965909090909092, "rewards/margins": 11.38691348973607, "rewards/rejected": -9.790322580645162, "step": 1291 }, { "epoch": 0.8858416181007884, "grad_norm": 0.18995058475515417, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87090062.22222222, "logits/rejected": 148073910.85714287, "logps/chosen": -246.66666666666666, "logps/rejected": -436.0, "loss": 0.1589, "rewards/chosen": 2.09375, "rewards/margins": 9.808035714285715, "rewards/rejected": -7.714285714285714, "step": 1292 }, { "epoch": 0.8865272540281111, "grad_norm": 0.18506928908229528, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90214328.14035088, "logits/rejected": 107456886.9859155, "logps/chosen": -167.2982456140351, "logps/rejected": -402.0281690140845, "loss": 0.1873, "rewards/chosen": 0.8969298245614035, "rewards/margins": 9.629324190758588, "rewards/rejected": -8.732394366197184, "step": 1293 }, { "epoch": 0.8872128899554337, "grad_norm": 0.18868394418611237, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112606398.91525424, "logits/rejected": 108687181.91304348, "logps/chosen": -279.3220338983051, "logps/rejected": -395.1304347826087, "loss": 0.1569, "rewards/chosen": 1.8591101694915255, "rewards/margins": 11.496791328911815, "rewards/rejected": -9.63768115942029, "step": 1294 }, { "epoch": 0.8878985258827563, "grad_norm": 0.1851058471233685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64978423.87301587, "logits/rejected": 77755943.38461539, "logps/chosen": -210.03174603174602, "logps/rejected": -377.6, "loss": 0.158, "rewards/chosen": 1.8234126984126984, "rewards/margins": 9.238797313797313, "rewards/rejected": -7.415384615384616, "step": 1295 }, { "epoch": 0.8885841618100788, "grad_norm": 0.2012207285316455, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101428922.92063493, "logits/rejected": 74077861.41538462, "logps/chosen": -242.28571428571428, "logps/rejected": -359.38461538461536, "loss": 0.17, "rewards/chosen": 1.6959325396825398, "rewards/margins": 9.865163308913308, "rewards/rejected": -8.169230769230769, "step": 1296 }, { "epoch": 0.8892697977374014, "grad_norm": 0.17156910526968427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130158724.12903225, "logits/rejected": 43563566.54545455, "logps/chosen": -267.0967741935484, "logps/rejected": -312.72727272727275, "loss": 0.1488, "rewards/chosen": 2.090725806451613, "rewards/margins": 8.393756109481917, "rewards/rejected": -6.303030303030303, "step": 1297 }, { "epoch": 0.8899554336647241, "grad_norm": 0.20893972010527012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2386414.3448275863, "logits/rejected": 144643569.37142858, "logps/chosen": -208.0, "logps/rejected": -416.9142857142857, "loss": 0.1503, "rewards/chosen": 1.2133620689655173, "rewards/margins": 10.08479064039409, "rewards/rejected": -8.871428571428572, "step": 1298 }, { "epoch": 0.8906410695920466, "grad_norm": 0.17824635592256768, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149123639.13846153, "logits/rejected": 60318086.0952381, "logps/chosen": -243.69230769230768, "logps/rejected": -352.5079365079365, "loss": 0.2015, "rewards/chosen": 1.2769230769230768, "rewards/margins": 7.665811965811966, "rewards/rejected": -6.388888888888889, "step": 1299 }, { "epoch": 0.8913267055193692, "grad_norm": 0.1934851298230947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138992781.7846154, "logits/rejected": 82055233.01587301, "logps/chosen": -222.03076923076924, "logps/rejected": -439.36507936507934, "loss": 0.1838, "rewards/chosen": 1.2875, "rewards/margins": 11.041468253968254, "rewards/rejected": -9.753968253968255, "step": 1300 }, { "epoch": 0.8920123414466918, "grad_norm": 0.17762385456859914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103026284.16901408, "logits/rejected": 101803852.3508772, "logps/chosen": -257.5774647887324, "logps/rejected": -409.2631578947368, "loss": 0.1642, "rewards/chosen": 1.8380281690140845, "rewards/margins": 10.732765011119348, "rewards/rejected": -8.894736842105264, "step": 1301 }, { "epoch": 0.8926979773740144, "grad_norm": 0.16659895048303344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146729550.1016949, "logits/rejected": 21245061.56521739, "logps/chosen": -240.8135593220339, "logps/rejected": -313.27536231884056, "loss": 0.1489, "rewards/chosen": 1.3728813559322033, "rewards/margins": 10.227953819700318, "rewards/rejected": -8.855072463768115, "step": 1302 }, { "epoch": 0.893383613301337, "grad_norm": 0.14549121195016795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128359017.65079366, "logits/rejected": 54267840.984615386, "logps/chosen": -223.87301587301587, "logps/rejected": -362.83076923076925, "loss": 0.1553, "rewards/chosen": 1.8472222222222223, "rewards/margins": 10.670299145299147, "rewards/rejected": -8.823076923076924, "step": 1303 }, { "epoch": 0.8940692492286596, "grad_norm": 0.2295520403898564, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118871196.20338982, "logits/rejected": 53036670.14492754, "logps/chosen": -263.45762711864404, "logps/rejected": -365.4492753623188, "loss": 0.1719, "rewards/chosen": 1.2489406779661016, "rewards/margins": 10.252563866371899, "rewards/rejected": -9.003623188405797, "step": 1304 }, { "epoch": 0.8947548851559822, "grad_norm": 0.20479995886324637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63527573.66153846, "logits/rejected": 105190481.26984127, "logps/chosen": -185.84615384615384, "logps/rejected": -361.3968253968254, "loss": 0.1626, "rewards/chosen": 1.2009615384615384, "rewards/margins": 10.73270757020757, "rewards/rejected": -9.531746031746032, "step": 1305 }, { "epoch": 0.8954405210833047, "grad_norm": 0.19233488132496676, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137829489.7777778, "logits/rejected": 83923529.14285715, "logps/chosen": -259.77777777777777, "logps/rejected": -419.42857142857144, "loss": 0.1876, "rewards/chosen": 1.7274305555555556, "rewards/margins": 11.352430555555555, "rewards/rejected": -9.625, "step": 1306 }, { "epoch": 0.8961261570106274, "grad_norm": 0.13901260411290267, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118505732.06349206, "logits/rejected": 83563441.23076923, "logps/chosen": -227.3015873015873, "logps/rejected": -358.89230769230767, "loss": 0.1199, "rewards/chosen": 2.3055555555555554, "rewards/margins": 10.5517094017094, "rewards/rejected": -8.246153846153845, "step": 1307 }, { "epoch": 0.89681179293795, "grad_norm": 0.25523316141146607, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98669282.62295082, "logits/rejected": 108300685.37313433, "logps/chosen": -236.85245901639345, "logps/rejected": -359.1641791044776, "loss": 0.1554, "rewards/chosen": 1.8852459016393444, "rewards/margins": 7.810619035967703, "rewards/rejected": -5.925373134328358, "step": 1308 }, { "epoch": 0.8974974288652725, "grad_norm": 0.16282885136408184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143147536.51612905, "logits/rejected": 90749486.54545455, "logps/chosen": -253.67741935483872, "logps/rejected": -408.72727272727275, "loss": 0.1459, "rewards/chosen": 1.7278225806451613, "rewards/margins": 11.48539833822092, "rewards/rejected": -9.757575757575758, "step": 1309 }, { "epoch": 0.8981830647925951, "grad_norm": 0.1906483425864972, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93964929.91044776, "logits/rejected": 101763441.31147541, "logps/chosen": -184.955223880597, "logps/rejected": -420.1967213114754, "loss": 0.1753, "rewards/chosen": 1.7164179104477613, "rewards/margins": 11465908.077073649, "rewards/rejected": -11465906.360655738, "step": 1310 }, { "epoch": 0.8988687007199178, "grad_norm": 0.22754884985260423, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119907749.64705883, "logits/rejected": 79447108.26666667, "logps/chosen": -216.11764705882354, "logps/rejected": -384.53333333333336, "loss": 0.1973, "rewards/chosen": 1.1167279411764706, "rewards/margins": 9.891727941176471, "rewards/rejected": -8.775, "step": 1311 }, { "epoch": 0.8995543366472403, "grad_norm": 0.2508094178584205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145096704.0, "logits/rejected": 70320128.0, "logps/chosen": -225.75, "logps/rejected": -356.5, "loss": 0.1824, "rewards/chosen": 1.576171875, "rewards/margins": 9.755859375, "rewards/rejected": -8.1796875, "step": 1312 }, { "epoch": 0.9002399725745629, "grad_norm": 0.2316694563030909, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69434796.21818182, "logits/rejected": 119623848.32876712, "logps/chosen": -192.87272727272727, "logps/rejected": -494.4657534246575, "loss": 0.1251, "rewards/chosen": 1.6590909090909092, "rewards/margins": 12.823474470734745, "rewards/rejected": -11.164383561643836, "step": 1313 }, { "epoch": 0.9009256085018855, "grad_norm": 0.2041691798443327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86600041.41176471, "logits/rejected": 86787140.26666667, "logps/chosen": -233.88235294117646, "logps/rejected": -408.0, "loss": 0.1428, "rewards/chosen": 2.1801470588235294, "rewards/margins": 11.005147058823528, "rewards/rejected": -8.825, "step": 1314 }, { "epoch": 0.901611244429208, "grad_norm": 0.18210158357100498, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141918745.18032786, "logits/rejected": 96735048.59701492, "logps/chosen": -309.7704918032787, "logps/rejected": -414.089552238806, "loss": 0.1419, "rewards/chosen": 1.848360655737705, "rewards/margins": 11.243883043797407, "rewards/rejected": -9.395522388059701, "step": 1315 }, { "epoch": 0.9022968803565307, "grad_norm": 0.18853887727220922, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129617523.61290322, "logits/rejected": 53826901.333333336, "logps/chosen": -279.48387096774195, "logps/rejected": -316.1212121212121, "loss": 0.1525, "rewards/chosen": 2.6754032258064515, "rewards/margins": 10.30419110459433, "rewards/rejected": -7.628787878787879, "step": 1316 }, { "epoch": 0.9029825162838533, "grad_norm": 0.16391431645411877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132563636.28169014, "logits/rejected": 17264711.85964912, "logps/chosen": -306.92957746478874, "logps/rejected": -365.4736842105263, "loss": 0.1787, "rewards/chosen": 2.1003521126760565, "rewards/margins": 10.284562638991845, "rewards/rejected": -8.18421052631579, "step": 1317 }, { "epoch": 0.9036681522111759, "grad_norm": 0.18747565029139762, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146445190.5084746, "logits/rejected": 58253905.623188406, "logps/chosen": -260.8813559322034, "logps/rejected": -382.1449275362319, "loss": 0.1602, "rewards/chosen": 1.9427966101694916, "rewards/margins": 10.58047776958978, "rewards/rejected": -8.63768115942029, "step": 1318 }, { "epoch": 0.9043537881384984, "grad_norm": 0.18174981873965845, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84724940.8, "logits/rejected": 93724190.11764705, "logps/chosen": -247.2, "logps/rejected": -357.6470588235294, "loss": 0.1241, "rewards/chosen": 2.154166666666667, "rewards/margins": 10.212990196078431, "rewards/rejected": -8.058823529411764, "step": 1319 }, { "epoch": 0.905039424065821, "grad_norm": 0.19864872473894588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129728050.47887324, "logits/rejected": 63356065.684210524, "logps/chosen": -219.26760563380282, "logps/rejected": -338.5263157894737, "loss": 0.1877, "rewards/chosen": 0.9991197183098591, "rewards/margins": 9.49911971830986, "rewards/rejected": -8.5, "step": 1320 }, { "epoch": 0.9057250599931437, "grad_norm": 0.18017091543859917, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114644309.33333333, "logits/rejected": 81046858.83076923, "logps/chosen": -203.68253968253967, "logps/rejected": -371.2, "loss": 0.1433, "rewards/chosen": 1.7400793650793651, "rewards/margins": 10.278540903540904, "rewards/rejected": -8.538461538461538, "step": 1321 }, { "epoch": 0.9064106959204662, "grad_norm": 0.1750053371377009, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74309085.86666666, "logits/rejected": 74017129.41176471, "logps/chosen": -200.8, "logps/rejected": -362.3529411764706, "loss": 0.1449, "rewards/chosen": 1.4041666666666666, "rewards/margins": 10.95563725490196, "rewards/rejected": -9.551470588235293, "step": 1322 }, { "epoch": 0.9070963318477888, "grad_norm": 0.2480050438329621, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126447951.73770492, "logits/rejected": 100412889.79104477, "logps/chosen": -251.01639344262296, "logps/rejected": -422.2089552238806, "loss": 0.1913, "rewards/chosen": 1.2059426229508197, "rewards/margins": 9.929823219965746, "rewards/rejected": -8.723880597014926, "step": 1323 }, { "epoch": 0.9077819677751114, "grad_norm": 0.2057621621882048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102477049.08108108, "logits/rejected": 51807421.62962963, "logps/chosen": -177.2972972972973, "logps/rejected": -363.55555555555554, "loss": 0.214, "rewards/chosen": 1.083614864864865, "rewards/margins": 8.166948198198199, "rewards/rejected": -7.083333333333333, "step": 1324 }, { "epoch": 0.908467603702434, "grad_norm": 0.15524880523179424, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 55008629.84126984, "logits/rejected": 120408788.67692308, "logps/chosen": -242.28571428571428, "logps/rejected": -407.6307692307692, "loss": 0.1507, "rewards/chosen": 1.8214285714285714, "rewards/margins": 9.436813186813186, "rewards/rejected": -7.615384615384615, "step": 1325 }, { "epoch": 0.9091532396297566, "grad_norm": 0.16277787437315824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113508352.0, "logits/rejected": 81657856.0, "logps/chosen": -191.25, "logps/rejected": -380.0, "loss": 0.166, "rewards/chosen": 1.107421875, "rewards/margins": 9.927734375, "rewards/rejected": -8.8203125, "step": 1326 }, { "epoch": 0.9098388755570792, "grad_norm": 0.21098262085082167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135451346.82352942, "logits/rejected": 59891165.86666667, "logps/chosen": -250.58823529411765, "logps/rejected": -422.4, "loss": 0.1919, "rewards/chosen": 0.5597426470588235, "rewards/margins": 9.276409313725491, "rewards/rejected": -8.716666666666667, "step": 1327 }, { "epoch": 0.9105245114844018, "grad_norm": 0.17253465788718275, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87314757.07936507, "logits/rejected": 101308573.53846154, "logps/chosen": -153.9047619047619, "logps/rejected": -399.75384615384615, "loss": 0.1675, "rewards/chosen": 1.0505952380952381, "rewards/margins": 10.373672161172163, "rewards/rejected": -9.323076923076924, "step": 1328 }, { "epoch": 0.9112101474117243, "grad_norm": 0.19505280921226995, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61147781.26027397, "logits/rejected": 89700910.54545455, "logps/chosen": -219.17808219178082, "logps/rejected": -311.8545454545455, "loss": 0.1806, "rewards/chosen": 1.731164383561644, "rewards/margins": 10.47661892901619, "rewards/rejected": -8.745454545454546, "step": 1329 }, { "epoch": 0.911895783339047, "grad_norm": 0.20121517899441665, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157438367.53623188, "logits/rejected": 67535403.38983051, "logps/chosen": -308.8695652173913, "logps/rejected": -377.76271186440675, "loss": 0.1784, "rewards/chosen": 1.8496376811594204, "rewards/margins": 9.896247850650946, "rewards/rejected": -8.046610169491526, "step": 1330 }, { "epoch": 0.9125814192663696, "grad_norm": 0.20135362778911983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114328609.03225806, "logits/rejected": 38122092.60606061, "logps/chosen": -179.3548387096774, "logps/rejected": -367.030303030303, "loss": 0.1696, "rewards/chosen": 1.2752016129032258, "rewards/margins": 9.813080400782013, "rewards/rejected": -8.537878787878787, "step": 1331 }, { "epoch": 0.9132670551936921, "grad_norm": 0.18297027291974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107607963.27868852, "logits/rejected": 73588124.65671642, "logps/chosen": -243.14754098360655, "logps/rejected": -330.9850746268657, "loss": 0.1339, "rewards/chosen": 2.4385245901639343, "rewards/margins": 10.826584291656472, "rewards/rejected": -8.388059701492537, "step": 1332 }, { "epoch": 0.9139526911210147, "grad_norm": 0.1587037352707054, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137013930.66666666, "logits/rejected": 87590140.67532468, "logps/chosen": -179.6078431372549, "logps/rejected": -359.8961038961039, "loss": 0.1323, "rewards/chosen": 1.2383578431372548, "rewards/margins": 10.30329290807232, "rewards/rejected": -9.064935064935066, "step": 1333 }, { "epoch": 0.9146383270483374, "grad_norm": 0.18614231898104422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102513724.23529412, "logits/rejected": 77315003.73333333, "logps/chosen": -201.64705882352942, "logps/rejected": -370.1333333333333, "loss": 0.1443, "rewards/chosen": 1.7775735294117647, "rewards/margins": 12.610906862745098, "rewards/rejected": -10.833333333333334, "step": 1334 }, { "epoch": 0.9153239629756599, "grad_norm": 0.17518541103528035, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132275920.5925926, "logits/rejected": 85813192.64864865, "logps/chosen": -217.33333333333334, "logps/rejected": -380.97297297297297, "loss": 0.1349, "rewards/chosen": 1.3321759259259258, "rewards/margins": 10.15650025025025, "rewards/rejected": -8.824324324324325, "step": 1335 }, { "epoch": 0.9160095989029825, "grad_norm": 0.18405515938281158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131085981.01333334, "logits/rejected": 35493308.37735849, "logps/chosen": -207.78666666666666, "logps/rejected": -359.54716981132077, "loss": 0.1886, "rewards/chosen": 1.1866666666666668, "rewards/margins": -1985905.5303144653, "rewards/rejected": 1985906.716981132, "step": 1336 }, { "epoch": 0.9166952348303051, "grad_norm": 0.18827878920507637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119957094.4, "logits/rejected": 81427350.06896552, "logps/chosen": -294.85714285714283, "logps/rejected": -409.9310344827586, "loss": 0.176, "rewards/chosen": 1.8053571428571429, "rewards/margins": 10.538115763546799, "rewards/rejected": -8.732758620689655, "step": 1337 }, { "epoch": 0.9173808707576278, "grad_norm": 0.2559599837248222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157189608.36923078, "logits/rejected": 21071384.38095238, "logps/chosen": -232.36923076923077, "logps/rejected": -346.41269841269843, "loss": 0.1806, "rewards/chosen": 0.9711538461538461, "rewards/margins": 10.84416971916972, "rewards/rejected": -9.873015873015873, "step": 1338 }, { "epoch": 0.9180665066849503, "grad_norm": 0.19321063102648375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102916951.88059701, "logits/rejected": 58513978.75409836, "logps/chosen": -231.16417910447763, "logps/rejected": -410.75409836065575, "loss": 0.1739, "rewards/chosen": 1.328358208955224, "rewards/margins": 10.51688279911916, "rewards/rejected": -9.188524590163935, "step": 1339 }, { "epoch": 0.9187521426122729, "grad_norm": 0.1501152668310502, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102061397.33333333, "logits/rejected": 102328681.41176471, "logps/chosen": -252.53333333333333, "logps/rejected": -401.4117647058824, "loss": 0.1269, "rewards/chosen": 2.31875, "rewards/margins": 11.259926470588235, "rewards/rejected": -8.941176470588236, "step": 1340 }, { "epoch": 0.9194377785395955, "grad_norm": 0.1827319694379326, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156121315.55555555, "logits/rejected": 37711286.85714286, "logps/chosen": -259.77777777777777, "logps/rejected": -394.85714285714283, "loss": 0.1954, "rewards/chosen": 1.796875, "rewards/margins": 8.868303571428571, "rewards/rejected": -7.071428571428571, "step": 1341 }, { "epoch": 0.920123414466918, "grad_norm": 0.2019115639056195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146292239.5151515, "logits/rejected": 40928289.03225806, "logps/chosen": -267.3939393939394, "logps/rejected": -349.4193548387097, "loss": 0.1638, "rewards/chosen": 1.8058712121212122, "rewards/margins": 9.765548631476051, "rewards/rejected": -7.959677419354839, "step": 1342 }, { "epoch": 0.9208090503942407, "grad_norm": 0.2513846242085668, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95384258.20689656, "logits/rejected": 104018739.2, "logps/chosen": -251.72413793103448, "logps/rejected": -409.14285714285717, "loss": 0.1508, "rewards/chosen": 1.6260775862068966, "rewards/margins": 11.483220443349754, "rewards/rejected": -9.857142857142858, "step": 1343 }, { "epoch": 0.9214946863215633, "grad_norm": 0.19328410259029422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94750014.95081967, "logits/rejected": 106422638.80597015, "logps/chosen": -272.5245901639344, "logps/rejected": -406.44776119402985, "loss": 0.1642, "rewards/chosen": 1.8545081967213115, "rewards/margins": 11.302269390751164, "rewards/rejected": -9.447761194029852, "step": 1344 }, { "epoch": 0.9221803222488858, "grad_norm": 0.1821423058850185, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107872256.0, "logits/rejected": 95289344.0, "logps/chosen": -290.25, "logps/rejected": -359.75, "loss": 0.1414, "rewards/chosen": 2.212890625, "rewards/margins": 11.947265625, "rewards/rejected": -9.734375, "step": 1345 }, { "epoch": 0.9228659581762084, "grad_norm": 0.19078538578418155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121183121.72307692, "logits/rejected": 81189741.71428572, "logps/chosen": -209.47692307692307, "logps/rejected": -377.3968253968254, "loss": 0.1721, "rewards/chosen": 1.6730769230769231, "rewards/margins": 11.395299145299145, "rewards/rejected": -9.722222222222221, "step": 1346 }, { "epoch": 0.923551594103531, "grad_norm": 0.23188327984372403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123081127.72413793, "logits/rejected": 53747009.82857143, "logps/chosen": -203.0344827586207, "logps/rejected": -367.0857142857143, "loss": 0.1743, "rewards/chosen": 0.9741379310344828, "rewards/margins": 9.28128078817734, "rewards/rejected": -8.307142857142857, "step": 1347 }, { "epoch": 0.9242372300308536, "grad_norm": 0.2236770712219254, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102188497.45454545, "logits/rejected": 81450677.67741935, "logps/chosen": -223.27272727272728, "logps/rejected": -373.93548387096774, "loss": 0.1684, "rewards/chosen": 1.0208333333333333, "rewards/margins": 10.601478494623656, "rewards/rejected": -9.580645161290322, "step": 1348 }, { "epoch": 0.9249228659581762, "grad_norm": 0.18468204468440774, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140693144.7017544, "logits/rejected": 71657616.22535211, "logps/chosen": -267.7894736842105, "logps/rejected": -360.5633802816901, "loss": 0.1353, "rewards/chosen": 1.638157894736842, "rewards/margins": 9.525481838398814, "rewards/rejected": -7.887323943661972, "step": 1349 }, { "epoch": 0.9256085018854988, "grad_norm": 0.15351986321420397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98363193.80645162, "logits/rejected": 84458030.54545455, "logps/chosen": -225.41935483870967, "logps/rejected": -355.8787878787879, "loss": 0.1263, "rewards/chosen": 2.1693548387096775, "rewards/margins": 10.66177908113392, "rewards/rejected": -8.492424242424242, "step": 1350 }, { "epoch": 0.9262941378128214, "grad_norm": 0.1889252506757514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133591712.47761194, "logits/rejected": 69996745.44262294, "logps/chosen": -237.61194029850745, "logps/rejected": -354.0983606557377, "loss": 0.1863, "rewards/chosen": 1.1791044776119404, "rewards/margins": 10.07254710056276, "rewards/rejected": -8.89344262295082, "step": 1351 }, { "epoch": 0.926979773740144, "grad_norm": 0.2113976297020868, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119568964.7761194, "logits/rejected": 91793374.4262295, "logps/chosen": -299.7014925373134, "logps/rejected": -412.8524590163934, "loss": 0.1434, "rewards/chosen": 2.3022388059701493, "rewards/margins": 10.662894543675069, "rewards/rejected": -8.360655737704919, "step": 1352 }, { "epoch": 0.9276654096674666, "grad_norm": 0.19765639242622612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96744933.05263157, "logits/rejected": 89143728.67605634, "logps/chosen": -267.7894736842105, "logps/rejected": -425.01408450704224, "loss": 0.1257, "rewards/chosen": 2.293859649122807, "rewards/margins": 9.998085001235482, "rewards/rejected": -7.704225352112676, "step": 1353 }, { "epoch": 0.9283510455947892, "grad_norm": 0.21437975621555594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121035629.71428572, "logits/rejected": 134604894.52307692, "logps/chosen": -262.3492063492063, "logps/rejected": -426.33846153846156, "loss": 0.1552, "rewards/chosen": 2.142857142857143, "rewards/margins": 10.473626373626374, "rewards/rejected": -8.330769230769231, "step": 1354 }, { "epoch": 0.9290366815221117, "grad_norm": 0.20456617857211026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108670603.63636364, "logits/rejected": 72943682.06451613, "logps/chosen": -265.2121212121212, "logps/rejected": -379.35483870967744, "loss": 0.172, "rewards/chosen": 1.9602272727272727, "rewards/margins": 9.637646627565983, "rewards/rejected": -7.67741935483871, "step": 1355 }, { "epoch": 0.9297223174494343, "grad_norm": 0.1747642053889939, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104045799.22580644, "logits/rejected": 99710045.0909091, "logps/chosen": -212.90322580645162, "logps/rejected": -370.90909090909093, "loss": 0.1442, "rewards/chosen": 1.7338709677419355, "rewards/margins": 11.430840664711633, "rewards/rejected": -9.696969696969697, "step": 1356 }, { "epoch": 0.930407953376757, "grad_norm": 0.17888136388640405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132094029.7721519, "logits/rejected": 49475667.591836736, "logps/chosen": -282.73417721518985, "logps/rejected": -309.55102040816325, "loss": 0.2135, "rewards/chosen": 1.7136075949367089, "rewards/margins": 7.6931994316714025, "rewards/rejected": -5.979591836734694, "step": 1357 }, { "epoch": 0.9310935893040795, "grad_norm": 0.21166725872537434, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92900451.09677419, "logits/rejected": 87635533.57575758, "logps/chosen": -261.93548387096774, "logps/rejected": -346.1818181818182, "loss": 0.1543, "rewards/chosen": 2.036290322580645, "rewards/margins": 10.39235092864125, "rewards/rejected": -8.356060606060606, "step": 1358 }, { "epoch": 0.9317792252314021, "grad_norm": 0.24183186047934369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 66307011.76470588, "logits/rejected": 96678707.2, "logps/chosen": -184.0, "logps/rejected": -361.06666666666666, "loss": 0.1908, "rewards/chosen": 1.1231617647058822, "rewards/margins": 8.723161764705882, "rewards/rejected": -7.6, "step": 1359 }, { "epoch": 0.9324648611587247, "grad_norm": 0.17488422497623207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82037742.6440678, "logits/rejected": 79570201.9710145, "logps/chosen": -257.6271186440678, "logps/rejected": -380.28985507246375, "loss": 0.1553, "rewards/chosen": 1.2690677966101696, "rewards/margins": 11.08066199950872, "rewards/rejected": -9.81159420289855, "step": 1360 }, { "epoch": 0.9331504970860474, "grad_norm": 0.20109141492027624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149344726.03278688, "logits/rejected": 51317622.44776119, "logps/chosen": -233.31147540983608, "logps/rejected": -355.34328358208955, "loss": 0.1626, "rewards/chosen": 2.096311475409836, "rewards/margins": 9.820192072424762, "rewards/rejected": -7.723880597014926, "step": 1361 }, { "epoch": 0.9338361330133699, "grad_norm": 0.19145111826371036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109630428.68965517, "logits/rejected": 75928137.14285715, "logps/chosen": -248.0, "logps/rejected": -375.3142857142857, "loss": 0.1545, "rewards/chosen": 1.7165948275862069, "rewards/margins": 9.745166256157635, "rewards/rejected": -8.028571428571428, "step": 1362 }, { "epoch": 0.9345217689406925, "grad_norm": 0.1715366720792653, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138472819.01449275, "logits/rejected": 105213049.49152543, "logps/chosen": -264.81159420289856, "logps/rejected": -346.3050847457627, "loss": 0.165, "rewards/chosen": 2.0905797101449277, "rewards/margins": -7633302.858572832, "rewards/rejected": 7633304.949152542, "step": 1363 }, { "epoch": 0.9352074048680151, "grad_norm": 0.19738270972518146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130835224.77419356, "logits/rejected": 34977357.57575758, "logps/chosen": -230.19354838709677, "logps/rejected": -351.030303030303, "loss": 0.1663, "rewards/chosen": 1.8810483870967742, "rewards/margins": 11.502260508308895, "rewards/rejected": -9.621212121212121, "step": 1364 }, { "epoch": 0.9358930407953376, "grad_norm": 0.20626584633403805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152540213.89473686, "logits/rejected": 87999724.3076923, "logps/chosen": -258.94736842105266, "logps/rejected": -413.53846153846155, "loss": 0.1827, "rewards/chosen": 1.6973684210526316, "rewards/margins": 11.014676113360323, "rewards/rejected": -9.317307692307692, "step": 1365 }, { "epoch": 0.9365786767226603, "grad_norm": 0.22115983786868987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 49582665.14285714, "logits/rejected": 109180959.5076923, "logps/chosen": -180.57142857142858, "logps/rejected": -403.6923076923077, "loss": 0.1694, "rewards/chosen": 1.4861111111111112, "rewards/margins": 10.563034188034187, "rewards/rejected": -9.076923076923077, "step": 1366 }, { "epoch": 0.9372643126499829, "grad_norm": 0.22668202799342177, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104294186.02985075, "logits/rejected": 85192502.55737706, "logps/chosen": -213.97014925373134, "logps/rejected": -385.04918032786884, "loss": 0.1521, "rewards/chosen": 1.2276119402985075, "rewards/margins": 11.129251284560802, "rewards/rejected": -9.901639344262295, "step": 1367 }, { "epoch": 0.9379499485773054, "grad_norm": 0.20224466323122142, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134567253.33333334, "logits/rejected": 98191652.57142857, "logps/chosen": -238.66666666666666, "logps/rejected": -458.2857142857143, "loss": 0.1727, "rewards/chosen": 1.953125, "rewards/margins": 11.542410714285714, "rewards/rejected": -9.589285714285714, "step": 1368 }, { "epoch": 0.938635584504628, "grad_norm": 0.1632924718444259, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117720132.26666667, "logits/rejected": 69236856.47058824, "logps/chosen": -200.0, "logps/rejected": -358.11764705882354, "loss": 0.1557, "rewards/chosen": 1.3973958333333334, "rewards/margins": 10.640042892156863, "rewards/rejected": -9.242647058823529, "step": 1369 }, { "epoch": 0.9393212204319507, "grad_norm": 0.1771577473292931, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117842466.13333334, "logits/rejected": 69977027.76470588, "logps/chosen": -257.8666666666667, "logps/rejected": -346.8235294117647, "loss": 0.1413, "rewards/chosen": 2.0125, "rewards/margins": 8.563970588235295, "rewards/rejected": -6.551470588235294, "step": 1370 }, { "epoch": 0.9400068563592733, "grad_norm": 0.17837995997701744, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94502912.0, "logits/rejected": 97714176.0, "logps/chosen": -256.5, "logps/rejected": -382.5, "loss": 0.1566, "rewards/chosen": 2.169921875, "rewards/margins": 10.982421875, "rewards/rejected": -8.8125, "step": 1371 }, { "epoch": 0.9406924922865958, "grad_norm": 0.15399949940624197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120894644.70588236, "logits/rejected": 69345826.13333334, "logps/chosen": -223.52941176470588, "logps/rejected": -310.93333333333334, "loss": 0.1526, "rewards/chosen": 1.5974264705882353, "rewards/margins": 9.989093137254903, "rewards/rejected": -8.391666666666667, "step": 1372 }, { "epoch": 0.9413781282139184, "grad_norm": 0.16234326445866307, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82687707.42857143, "logits/rejected": 100147073.96923077, "logps/chosen": -159.4920634920635, "logps/rejected": -389.4153846153846, "loss": 0.1494, "rewards/chosen": 1.4841269841269842, "rewards/margins": 9.907203907203908, "rewards/rejected": -8.423076923076923, "step": 1373 }, { "epoch": 0.942063764141241, "grad_norm": 0.21909152294369416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151511166.03076923, "logits/rejected": 66900813.20634921, "logps/chosen": -265.3538461538462, "logps/rejected": -367.4920634920635, "loss": 0.1766, "rewards/chosen": 1.5923076923076922, "rewards/margins": 10.616117216117216, "rewards/rejected": -9.023809523809524, "step": 1374 }, { "epoch": 0.9427494000685636, "grad_norm": 0.16498964515989295, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172652804.65454546, "logits/rejected": 67683426.19178082, "logps/chosen": -246.6909090909091, "logps/rejected": -397.1506849315069, "loss": 0.1176, "rewards/chosen": 1.3113636363636363, "rewards/margins": 11.338760896637607, "rewards/rejected": -10.027397260273972, "step": 1375 }, { "epoch": 0.9434350359958862, "grad_norm": 0.17523620176292135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104857600.0, "logits/rejected": 114032640.0, "logps/chosen": -177.0, "logps/rejected": -398.5, "loss": 0.1727, "rewards/chosen": 1.072265625, "rewards/margins": 8.939453125, "rewards/rejected": -7.8671875, "step": 1376 }, { "epoch": 0.9441206719232088, "grad_norm": 0.16601262442367806, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105644032.0, "logits/rejected": 58032128.0, "logps/chosen": -245.125, "logps/rejected": -401.5, "loss": 0.1405, "rewards/chosen": 1.99609375, "rewards/margins": 12.00390625, "rewards/rejected": -10.0078125, "step": 1377 }, { "epoch": 0.9448063078505313, "grad_norm": 0.197920705201814, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144826849.88235295, "logits/rejected": 80111206.4, "logps/chosen": -230.35294117647058, "logps/rejected": -383.2, "loss": 0.1816, "rewards/chosen": 1.3308823529411764, "rewards/margins": 10.91421568627451, "rewards/rejected": -9.583333333333334, "step": 1378 }, { "epoch": 0.945491943777854, "grad_norm": 0.2621358838058416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98024943.48387097, "logits/rejected": 89414935.27272727, "logps/chosen": -268.64516129032256, "logps/rejected": -451.8787878787879, "loss": 0.1459, "rewards/chosen": 2.4798387096774195, "rewards/margins": 11.691959921798633, "rewards/rejected": -9.212121212121213, "step": 1379 }, { "epoch": 0.9461775797051766, "grad_norm": 0.18445617199618938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96498529.35211268, "logits/rejected": 68562153.54385965, "logps/chosen": -224.22535211267606, "logps/rejected": -363.7894736842105, "loss": 0.1722, "rewards/chosen": 1.9744718309859155, "rewards/margins": 11.097278848529776, "rewards/rejected": -9.12280701754386, "step": 1380 }, { "epoch": 0.9468632156324991, "grad_norm": 0.17638670736227122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138992781.7846154, "logits/rejected": 59086425.396825396, "logps/chosen": -316.0615384615385, "logps/rejected": -364.95238095238096, "loss": 0.1632, "rewards/chosen": 2.044230769230769, "rewards/margins": 9.377564102564103, "rewards/rejected": -7.333333333333333, "step": 1381 }, { "epoch": 0.9475488515598217, "grad_norm": 0.21935735006065232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98657324.52173913, "logits/rejected": 113814927.18644068, "logps/chosen": -278.4927536231884, "logps/rejected": -477.8305084745763, "loss": 0.1581, "rewards/chosen": 1.8242753623188406, "rewards/margins": 11.502241464013755, "rewards/rejected": -9.677966101694915, "step": 1382 }, { "epoch": 0.9482344874871443, "grad_norm": 0.17512876146486228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153791146.66666666, "logits/rejected": 40448110.7027027, "logps/chosen": -231.7037037037037, "logps/rejected": -386.5945945945946, "loss": 0.1372, "rewards/chosen": 1.7650462962962963, "rewards/margins": 10.927208458458457, "rewards/rejected": -9.162162162162161, "step": 1383 }, { "epoch": 0.948920123414467, "grad_norm": 0.18883472767754045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91378324.64516129, "logits/rejected": 117440512.0, "logps/chosen": -274.06451612903226, "logps/rejected": -425.2121212121212, "loss": 0.1456, "rewards/chosen": 2.149193548387097, "rewards/margins": 12.050708699902248, "rewards/rejected": -9.901515151515152, "step": 1384 }, { "epoch": 0.9496057593417895, "grad_norm": 0.19408454121173685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124674166.72463769, "logits/rejected": 76208370.98305085, "logps/chosen": -215.42028985507247, "logps/rejected": -391.59322033898303, "loss": 0.1838, "rewards/chosen": 1.4166666666666667, "rewards/margins": 10.653954802259886, "rewards/rejected": -9.23728813559322, "step": 1385 }, { "epoch": 0.9502913952691121, "grad_norm": 0.23357535466328716, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103292561.19402985, "logits/rejected": 89524324.72131148, "logps/chosen": -220.29850746268656, "logps/rejected": -385.04918032786884, "loss": 0.1555, "rewards/chosen": 1.5111940298507462, "rewards/margins": 11.937423538047467, "rewards/rejected": -10.426229508196721, "step": 1386 }, { "epoch": 0.9509770311964347, "grad_norm": 0.18313413908338932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109915437.1764706, "logits/rejected": 93812599.46666667, "logps/chosen": -246.58823529411765, "logps/rejected": -405.3333333333333, "loss": 0.1599, "rewards/chosen": 1.7150735294117647, "rewards/margins": 11.88174019607843, "rewards/rejected": -10.166666666666666, "step": 1387 }, { "epoch": 0.9516626671237572, "grad_norm": 0.19150610371646204, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96024141.57575758, "logits/rejected": 38763486.96774194, "logps/chosen": -202.06060606060606, "logps/rejected": -309.4193548387097, "loss": 0.1615, "rewards/chosen": 2.0303030303030303, "rewards/margins": 9.409335288367547, "rewards/rejected": -7.379032258064516, "step": 1388 }, { "epoch": 0.9523483030510799, "grad_norm": 0.22912990834001537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107576788.61016949, "logits/rejected": 67503979.5942029, "logps/chosen": -246.77966101694915, "logps/rejected": -372.8695652173913, "loss": 0.1534, "rewards/chosen": 1.13135593220339, "rewards/margins": 8.174834193072956, "rewards/rejected": -7.043478260869565, "step": 1389 }, { "epoch": 0.9530339389784025, "grad_norm": 0.1671959355742688, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125763584.0, "logits/rejected": 68288512.0, "logps/chosen": -246.25, "logps/rejected": -395.0, "loss": 0.1354, "rewards/chosen": 2.01171875, "rewards/margins": 11.22265625, "rewards/rejected": -9.2109375, "step": 1390 }, { "epoch": 0.953719574905725, "grad_norm": 0.18879367969118852, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80255189.97014925, "logits/rejected": 91793374.4262295, "logps/chosen": -209.91044776119404, "logps/rejected": -353.04918032786884, "loss": 0.1869, "rewards/chosen": 1.6399253731343284, "rewards/margins": 9.861236848544165, "rewards/rejected": -8.221311475409836, "step": 1391 }, { "epoch": 0.9544052108330476, "grad_norm": 0.18391717351648731, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93597506.95384616, "logits/rejected": 81855504.25396825, "logps/chosen": -302.7692307692308, "logps/rejected": -328.12698412698415, "loss": 0.1442, "rewards/chosen": 2.55, "rewards/margins": 10.581746031746032, "rewards/rejected": -8.031746031746032, "step": 1392 }, { "epoch": 0.9550908467603703, "grad_norm": 0.21406308408695246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158353372.07017544, "logits/rejected": 69324165.40845071, "logps/chosen": -211.50877192982455, "logps/rejected": -416.0, "loss": 0.1487, "rewards/chosen": 1.9780701754385965, "rewards/margins": 11.154126513466766, "rewards/rejected": -9.17605633802817, "step": 1393 }, { "epoch": 0.9557764826876929, "grad_norm": 0.22319192572640367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123397639.42028986, "logits/rejected": 33305617.355932202, "logps/chosen": -202.43478260869566, "logps/rejected": -366.64406779661016, "loss": 0.1728, "rewards/chosen": 1.4909420289855073, "rewards/margins": 8.558738639154999, "rewards/rejected": -7.067796610169491, "step": 1394 }, { "epoch": 0.9564621186150154, "grad_norm": 0.16774158884274143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104508074.66666667, "logits/rejected": 102883809.88235295, "logps/chosen": -238.66666666666666, "logps/rejected": -426.3529411764706, "loss": 0.1551, "rewards/chosen": 1.9145833333333333, "rewards/margins": 11.88517156862745, "rewards/rejected": -9.970588235294118, "step": 1395 }, { "epoch": 0.957147754542338, "grad_norm": 0.21246693693219654, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84771290.89855072, "logits/rejected": 104244449.62711865, "logps/chosen": -243.47826086956522, "logps/rejected": -385.6271186440678, "loss": 0.1999, "rewards/chosen": 1.6295289855072463, "rewards/margins": 9.476986612625891, "rewards/rejected": -7.8474576271186445, "step": 1396 }, { "epoch": 0.9578333904696607, "grad_norm": 0.18589947648243813, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132367299.76470588, "logits/rejected": 79971396.26666667, "logps/chosen": -296.94117647058823, "logps/rejected": -409.06666666666666, "loss": 0.159, "rewards/chosen": 2.139705882352941, "rewards/margins": 11.581372549019608, "rewards/rejected": -9.441666666666666, "step": 1397 }, { "epoch": 0.9585190263969832, "grad_norm": 0.20170618833866807, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173184165.16129032, "logits/rejected": 58243630.54545455, "logps/chosen": -266.06451612903226, "logps/rejected": -394.6666666666667, "loss": 0.1452, "rewards/chosen": 1.5241935483870968, "rewards/margins": 10.766617790811338, "rewards/rejected": -9.242424242424242, "step": 1398 }, { "epoch": 0.9592046623243058, "grad_norm": 0.2509129880004503, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110849462.85714285, "logits/rejected": 90643569.77777778, "logps/chosen": -261.42857142857144, "logps/rejected": -400.8888888888889, "loss": 0.1702, "rewards/chosen": 1.9017857142857142, "rewards/margins": 10.985119047619047, "rewards/rejected": -9.083333333333334, "step": 1399 }, { "epoch": 0.9598902982516284, "grad_norm": 0.2142568384248356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91669172.28169014, "logits/rejected": 88935801.26315789, "logps/chosen": -192.90140845070422, "logps/rejected": -464.280701754386, "loss": 0.1556, "rewards/chosen": 1.971830985915493, "rewards/margins": 12.20867309117865, "rewards/rejected": -10.236842105263158, "step": 1400 }, { "epoch": 0.9605759341789509, "grad_norm": 0.16288937300580628, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147724808.6779661, "logits/rejected": 59388913.15942029, "logps/chosen": -221.83050847457628, "logps/rejected": -304.231884057971, "loss": 0.1533, "rewards/chosen": 1.4401483050847457, "rewards/margins": 10.889423667403586, "rewards/rejected": -9.44927536231884, "step": 1401 }, { "epoch": 0.9612615701062736, "grad_norm": 0.2058385783451718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107986683.93650794, "logits/rejected": 115085248.98461539, "logps/chosen": -274.031746031746, "logps/rejected": -461.7846153846154, "loss": 0.1595, "rewards/chosen": 1.6478174603174602, "rewards/margins": 10.740125152625152, "rewards/rejected": -9.092307692307692, "step": 1402 }, { "epoch": 0.9619472060335962, "grad_norm": 0.2210473986526705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136515976.76712328, "logits/rejected": 75878772.36363636, "logps/chosen": -279.013698630137, "logps/rejected": -417.1636363636364, "loss": 0.1923, "rewards/chosen": 1.4845890410958904, "rewards/margins": 8.639134495641345, "rewards/rejected": -7.154545454545454, "step": 1403 }, { "epoch": 0.9626328419609188, "grad_norm": 0.1844756736808023, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118567339.94029851, "logits/rejected": 90418192.78688525, "logps/chosen": -236.65671641791045, "logps/rejected": -341.5081967213115, "loss": 0.1679, "rewards/chosen": 1.830223880597015, "rewards/margins": 8.863010765842915, "rewards/rejected": -7.032786885245901, "step": 1404 }, { "epoch": 0.9633184778882413, "grad_norm": 0.2190898529933691, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109533217.5737705, "logits/rejected": 68016586.50746268, "logps/chosen": -233.18032786885246, "logps/rejected": -426.5074626865672, "loss": 0.1521, "rewards/chosen": 1.276639344262295, "rewards/margins": 9.134848299486176, "rewards/rejected": -7.858208955223881, "step": 1405 }, { "epoch": 0.964004113815564, "grad_norm": 0.1856106431785795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136234220.30769232, "logits/rejected": 44371321.2631579, "logps/chosen": -201.84615384615384, "logps/rejected": -365.89473684210526, "loss": 0.1295, "rewards/chosen": 1.125, "rewards/margins": 10.861842105263158, "rewards/rejected": -9.736842105263158, "step": 1406 }, { "epoch": 0.9646897497428866, "grad_norm": 0.19972315399699772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 77836072.42105263, "logps/chosen": -225.1267605633803, "logps/rejected": -399.1578947368421, "loss": 0.1519, "rewards/chosen": 2.051056338028169, "rewards/margins": 9.226494934519398, "rewards/rejected": -7.175438596491228, "step": 1407 }, { "epoch": 0.9653753856702091, "grad_norm": 0.18472447345250986, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83849922.20689656, "logits/rejected": 114264824.68571429, "logps/chosen": -272.2758620689655, "logps/rejected": -377.14285714285717, "loss": 0.1333, "rewards/chosen": 1.8900862068965518, "rewards/margins": 10.932943349753694, "rewards/rejected": -9.042857142857143, "step": 1408 }, { "epoch": 0.9660610215975317, "grad_norm": 0.1824442136231348, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80814080.0, "logits/rejected": 111869952.0, "logps/chosen": -232.25, "logps/rejected": -370.5, "loss": 0.1649, "rewards/chosen": 1.57421875, "rewards/margins": 11.70703125, "rewards/rejected": -10.1328125, "step": 1409 }, { "epoch": 0.9667466575248543, "grad_norm": 0.16451826932958638, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 9713125.05263158, "logits/rejected": 111060443.94366197, "logps/chosen": -224.28070175438597, "logps/rejected": -452.056338028169, "loss": 0.1422, "rewards/chosen": 1.8070175438596492, "rewards/margins": 12.145045712873733, "rewards/rejected": -10.338028169014084, "step": 1410 }, { "epoch": 0.9674322934521769, "grad_norm": 0.19056019052847498, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102376462.42253521, "logits/rejected": 18028148.771929823, "logps/chosen": -245.18309859154928, "logps/rejected": -382.3157894736842, "loss": 0.14, "rewards/chosen": 2.160211267605634, "rewards/margins": 10.405825302693353, "rewards/rejected": -8.24561403508772, "step": 1411 }, { "epoch": 0.9681179293794995, "grad_norm": 0.1998341482292422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124872524.3508772, "logits/rejected": 81227718.30985916, "logps/chosen": -276.49122807017545, "logps/rejected": -364.16901408450707, "loss": 0.1269, "rewards/chosen": 1.8081140350877194, "rewards/margins": 10.80811403508772, "rewards/rejected": -9.0, "step": 1412 }, { "epoch": 0.9688035653068221, "grad_norm": 0.2126964175893778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78373566.17142858, "logits/rejected": 51994906.48275862, "logps/chosen": -245.71428571428572, "logps/rejected": -308.9655172413793, "loss": 0.1992, "rewards/chosen": 0.6982142857142857, "rewards/margins": 9.35338669950739, "rewards/rejected": -8.655172413793103, "step": 1413 }, { "epoch": 0.9694892012341446, "grad_norm": 0.21573524488391232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156997137.6551724, "logits/rejected": 56192438.85714286, "logps/chosen": -263.44827586206895, "logps/rejected": -378.0571428571429, "loss": 0.156, "rewards/chosen": 2.0086206896551726, "rewards/margins": 11.337192118226602, "rewards/rejected": -9.32857142857143, "step": 1414 }, { "epoch": 0.9701748371614672, "grad_norm": 0.21317240724047354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97332525.1764706, "logits/rejected": 110869435.73333333, "logps/chosen": -259.29411764705884, "logps/rejected": -428.0, "loss": 0.168, "rewards/chosen": 1.661764705882353, "rewards/margins": 11.336764705882354, "rewards/rejected": -9.675, "step": 1415 }, { "epoch": 0.9708604730887899, "grad_norm": 0.24844307375414432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119071630.22222222, "logits/rejected": 50425270.85714286, "logps/chosen": -252.22222222222223, "logps/rejected": -330.57142857142856, "loss": 0.1528, "rewards/chosen": 2.3194444444444446, "rewards/margins": 11.248015873015873, "rewards/rejected": -8.928571428571429, "step": 1416 }, { "epoch": 0.9715461090161125, "grad_norm": 0.1590297261458622, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113017889.03225806, "logits/rejected": 106192151.27272727, "logps/chosen": -253.67741935483872, "logps/rejected": -443.1515151515151, "loss": 0.1469, "rewards/chosen": 1.435483870967742, "rewards/margins": 11.59457478005865, "rewards/rejected": -10.159090909090908, "step": 1417 }, { "epoch": 0.972231744943435, "grad_norm": 0.2852468880108052, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137090538.95890412, "logits/rejected": 85716321.74545455, "logps/chosen": -284.4931506849315, "logps/rejected": -429.96363636363634, "loss": 0.2049, "rewards/chosen": 1.2388698630136987, "rewards/margins": 11.238869863013699, "rewards/rejected": -10.0, "step": 1418 }, { "epoch": 0.9729173808707576, "grad_norm": 0.1843202507246112, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101435930.94736843, "logits/rejected": 99009204.28169014, "logps/chosen": -187.64912280701753, "logps/rejected": -428.61971830985914, "loss": 0.1517, "rewards/chosen": 1.4517543859649122, "rewards/margins": 11.874289597232519, "rewards/rejected": -10.422535211267606, "step": 1419 }, { "epoch": 0.9736030167980803, "grad_norm": 0.2887474999865245, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118878559.08571428, "logits/rejected": 47538458.48275862, "logps/chosen": -250.97142857142856, "logps/rejected": -363.58620689655174, "loss": 0.1755, "rewards/chosen": 1.7446428571428572, "rewards/margins": 10.123953201970442, "rewards/rejected": -8.379310344827585, "step": 1420 }, { "epoch": 0.9742886527254028, "grad_norm": 0.18914325565857285, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107438710.15384616, "logits/rejected": 76296387.04761904, "logps/chosen": -293.4153846153846, "logps/rejected": -388.57142857142856, "loss": 0.1768, "rewards/chosen": 1.3711538461538462, "rewards/margins": 10.021947496947497, "rewards/rejected": -8.65079365079365, "step": 1421 }, { "epoch": 0.9749742886527254, "grad_norm": 0.20170325457653887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123799618.06451613, "logits/rejected": 60499657.696969695, "logps/chosen": -297.80645161290323, "logps/rejected": -378.6666666666667, "loss": 0.1888, "rewards/chosen": 1.8387096774193548, "rewards/margins": 11.141739980449657, "rewards/rejected": -9.303030303030303, "step": 1422 }, { "epoch": 0.975659924580048, "grad_norm": 0.18398778151271786, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105552948.0677966, "logits/rejected": 133609857.85507247, "logps/chosen": -299.93220338983053, "logps/rejected": -406.72463768115944, "loss": 0.1597, "rewards/chosen": 1.7298728813559323, "rewards/margins": 11.613930852370425, "rewards/rejected": -9.884057971014492, "step": 1423 }, { "epoch": 0.9763455605073705, "grad_norm": 0.17941828053259617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102105088.0, "logits/rejected": 65470464.0, "logps/chosen": -236.5, "logps/rejected": -376.0, "loss": 0.1638, "rewards/chosen": 1.73046875, "rewards/margins": 9.35546875, "rewards/rejected": -7.625, "step": 1424 }, { "epoch": 0.9770311964346932, "grad_norm": 0.18102660822370453, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104040066.16949153, "logits/rejected": 73278745.9710145, "logps/chosen": -177.89830508474577, "logps/rejected": -390.95652173913044, "loss": 0.1719, "rewards/chosen": 1.2076271186440677, "rewards/margins": 10.381540162122329, "rewards/rejected": -9.173913043478262, "step": 1425 }, { "epoch": 0.9777168323620158, "grad_norm": 0.23199781854564755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87156215.3220339, "logits/rejected": 126558564.17391305, "logps/chosen": -276.33898305084745, "logps/rejected": -407.6521739130435, "loss": 0.1682, "rewards/chosen": 1.3697033898305084, "rewards/margins": 10.49289179562761, "rewards/rejected": -9.123188405797102, "step": 1426 }, { "epoch": 0.9784024682893384, "grad_norm": 0.19506441296725804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71436086.08450705, "logits/rejected": 85762479.15789473, "logps/chosen": -193.35211267605635, "logps/rejected": -372.7719298245614, "loss": 0.1874, "rewards/chosen": 1.5105633802816902, "rewards/margins": 9.984247590808005, "rewards/rejected": -8.473684210526315, "step": 1427 }, { "epoch": 0.9790881042166609, "grad_norm": 0.18032232667973833, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89012451.55555555, "logits/rejected": 98469352.36923076, "logps/chosen": -188.31746031746033, "logps/rejected": -353.96923076923076, "loss": 0.1531, "rewards/chosen": 1.5119047619047619, "rewards/margins": 10.6503663003663, "rewards/rejected": -9.138461538461538, "step": 1428 }, { "epoch": 0.9797737401439836, "grad_norm": 0.21743212145222038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150141865.22033897, "logits/rejected": 55620118.26086956, "logps/chosen": -298.5762711864407, "logps/rejected": -393.27536231884056, "loss": 0.1482, "rewards/chosen": 1.4533898305084745, "rewards/margins": 10.953389830508474, "rewards/rejected": -9.5, "step": 1429 }, { "epoch": 0.9804593760713062, "grad_norm": 0.2194672936050746, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 50627021.52112676, "logits/rejected": 110082083.92982456, "logps/chosen": -186.81690140845072, "logps/rejected": -377.2631578947368, "loss": 0.1679, "rewards/chosen": 2.045774647887324, "rewards/margins": 11.265072893501358, "rewards/rejected": -9.219298245614034, "step": 1430 }, { "epoch": 0.9811450119986287, "grad_norm": 0.168638869686347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101251970.24561404, "logits/rejected": 75970069.63380282, "logps/chosen": -239.71929824561403, "logps/rejected": -369.1267605633803, "loss": 0.1395, "rewards/chosen": 1.7445175438596492, "rewards/margins": 10.209306276254015, "rewards/rejected": -8.464788732394366, "step": 1431 }, { "epoch": 0.9818306479259513, "grad_norm": 0.17812299045073157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115487000.5479452, "logits/rejected": 103847154.03636363, "logps/chosen": -228.82191780821918, "logps/rejected": -416.0, "loss": 0.1776, "rewards/chosen": 1.6763698630136987, "rewards/margins": 10.95818804483188, "rewards/rejected": -9.281818181818181, "step": 1432 }, { "epoch": 0.9825162838532739, "grad_norm": 0.194186047152522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106179093.0410959, "logits/rejected": 74782533.81818181, "logps/chosen": -209.75342465753425, "logps/rejected": -437.5272727272727, "loss": 0.1859, "rewards/chosen": 1.2988013698630136, "rewards/margins": 11.25334682440847, "rewards/rejected": -9.954545454545455, "step": 1433 }, { "epoch": 0.9832019197805965, "grad_norm": 0.21626858738937002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96689744.84210527, "logits/rejected": 65218473.464788735, "logps/chosen": -252.0701754385965, "logps/rejected": -354.7042253521127, "loss": 0.1351, "rewards/chosen": 2.43859649122807, "rewards/margins": 11.424511984185816, "rewards/rejected": -8.985915492957746, "step": 1434 }, { "epoch": 0.9838875557079191, "grad_norm": 0.1984291575350344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116731615.54929577, "logits/rejected": 110008499.6491228, "logps/chosen": -247.43661971830986, "logps/rejected": -344.70175438596493, "loss": 0.1716, "rewards/chosen": 1.9929577464788732, "rewards/margins": 10.238571781566593, "rewards/rejected": -8.24561403508772, "step": 1435 }, { "epoch": 0.9845731916352417, "grad_norm": 0.19292922158432008, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79493182.06060606, "logits/rejected": 120856840.25806452, "logps/chosen": -195.3939393939394, "logps/rejected": -427.35483870967744, "loss": 0.1845, "rewards/chosen": 1.5568181818181819, "rewards/margins": 11.26649560117302, "rewards/rejected": -9.709677419354838, "step": 1436 }, { "epoch": 0.9852588275625643, "grad_norm": 0.17840383289083758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112506036.70588236, "logits/rejected": 95350510.93333334, "logps/chosen": -219.76470588235293, "logps/rejected": -348.8, "loss": 0.1585, "rewards/chosen": 1.568014705882353, "rewards/margins": 5.359681372549019, "rewards/rejected": -3.7916666666666665, "step": 1437 }, { "epoch": 0.9859444634898868, "grad_norm": 0.1850739786020172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70281710.34482759, "logits/rejected": 70224632.68571429, "logps/chosen": -216.55172413793105, "logps/rejected": -389.0285714285714, "loss": 0.1414, "rewards/chosen": 1.418103448275862, "rewards/margins": 10.268103448275863, "rewards/rejected": -8.85, "step": 1438 }, { "epoch": 0.9866300994172095, "grad_norm": 0.22387901856950077, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108265472.0, "logits/rejected": 96643754.66666667, "logps/chosen": -202.0, "logps/rejected": -480.0, "loss": 0.1481, "rewards/chosen": 1.4732142857142858, "rewards/margins": 11.084325396825397, "rewards/rejected": -9.61111111111111, "step": 1439 }, { "epoch": 0.9873157353445321, "grad_norm": 0.2036811572688832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115249457.6716418, "logits/rejected": 134406815.47540984, "logps/chosen": -239.52238805970148, "logps/rejected": -475.5409836065574, "loss": 0.1881, "rewards/chosen": 1.3218283582089552, "rewards/margins": 11.264451309028628, "rewards/rejected": -9.942622950819672, "step": 1440 }, { "epoch": 0.9880013712718546, "grad_norm": 0.20356266067027046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141897439.54929578, "logits/rejected": 74025786.38596492, "logps/chosen": -258.92957746478874, "logps/rejected": -359.859649122807, "loss": 0.1552, "rewards/chosen": 2.306338028169014, "rewards/margins": -11313853.693661971, "rewards/rejected": 11313856.0, "step": 1441 }, { "epoch": 0.9886870071991772, "grad_norm": 0.22193959520559536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95575123.93442623, "logits/rejected": 113559215.76119404, "logps/chosen": -190.95081967213116, "logps/rejected": -415.5223880597015, "loss": 0.1481, "rewards/chosen": 2.1700819672131146, "rewards/margins": 11.468589429899682, "rewards/rejected": -9.298507462686567, "step": 1442 }, { "epoch": 0.9893726431264999, "grad_norm": 0.2672120087015949, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127858621.93548387, "logits/rejected": 80073076.36363636, "logps/chosen": -226.32258064516128, "logps/rejected": -329.2121212121212, "loss": 0.1667, "rewards/chosen": 1.5887096774193548, "rewards/margins": 9.308406647116325, "rewards/rejected": -7.71969696969697, "step": 1443 }, { "epoch": 0.9900582790538224, "grad_norm": 0.20844737787720133, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106370299.80327868, "logits/rejected": 89629772.41791044, "logps/chosen": -265.7049180327869, "logps/rejected": -386.86567164179104, "loss": 0.1852, "rewards/chosen": 1.7950819672131149, "rewards/margins": 11.257768534377293, "rewards/rejected": -9.462686567164178, "step": 1444 }, { "epoch": 0.990743914981145, "grad_norm": 0.33764998413360775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104992900.12903225, "logits/rejected": 78293674.66666667, "logps/chosen": -201.5483870967742, "logps/rejected": -346.6666666666667, "loss": 0.1534, "rewards/chosen": 1.7409274193548387, "rewards/margins": 10.240927419354838, "rewards/rejected": -8.5, "step": 1445 }, { "epoch": 0.9914295509084676, "grad_norm": 0.19413526765557113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135141896.6779661, "logits/rejected": 64935728.231884055, "logps/chosen": -261.1525423728813, "logps/rejected": -354.7826086956522, "loss": 0.153, "rewards/chosen": 1.7796610169491525, "rewards/margins": 10.475313190862195, "rewards/rejected": -8.695652173913043, "step": 1446 }, { "epoch": 0.9921151868357901, "grad_norm": 0.22103834533423983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142999552.0, "logits/rejected": 32702464.0, "logps/chosen": -225.5, "logps/rejected": -354.0, "loss": 0.1571, "rewards/chosen": 1.4140625, "rewards/margins": 11.1484375, "rewards/rejected": -9.734375, "step": 1447 }, { "epoch": 0.9928008227631128, "grad_norm": 0.21151909474045608, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117324003.55555555, "logits/rejected": 77084379.42857143, "logps/chosen": -235.77777777777777, "logps/rejected": -352.0, "loss": 0.1875, "rewards/chosen": 1.8489583333333333, "rewards/margins": 9.152529761904763, "rewards/rejected": -7.303571428571429, "step": 1448 }, { "epoch": 0.9934864586904354, "grad_norm": 0.17055533662220868, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107304277.33333333, "logits/rejected": 90424259.76470588, "logps/chosen": -252.8, "logps/rejected": -432.94117647058823, "loss": 0.147, "rewards/chosen": 1.6479166666666667, "rewards/margins": 12.265563725490196, "rewards/rejected": -10.617647058823529, "step": 1449 }, { "epoch": 0.994172094617758, "grad_norm": 0.17477779036494662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100728832.0, "logits/rejected": 106168320.0, "logps/chosen": -224.0, "logps/rejected": -412.5, "loss": 0.1656, "rewards/chosen": 1.98046875, "rewards/margins": 8.49609375, "rewards/rejected": -6.515625, "step": 1450 }, { "epoch": 0.9948577305450805, "grad_norm": 0.2175257222712199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105508440.27586207, "logits/rejected": 80950067.2, "logps/chosen": -224.27586206896552, "logps/rejected": -361.14285714285717, "loss": 0.1665, "rewards/chosen": 0.6551724137931034, "rewards/margins": 10.026600985221675, "rewards/rejected": -9.371428571428572, "step": 1451 }, { "epoch": 0.9955433664724032, "grad_norm": 0.21036374577947053, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101908480.0, "logits/rejected": 97189888.0, "logps/chosen": -235.0, "logps/rejected": -373.0, "loss": 0.1375, "rewards/chosen": 1.7265625, "rewards/margins": 10.8203125, "rewards/rejected": -9.09375, "step": 1452 }, { "epoch": 0.9962290023997258, "grad_norm": 0.2845383560787424, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129965967.78082192, "logits/rejected": 93799889.45454545, "logps/chosen": -277.47945205479454, "logps/rejected": -453.8181818181818, "loss": 0.179, "rewards/chosen": 1.5102739726027397, "rewards/margins": 8.955728518057285, "rewards/rejected": -7.445454545454545, "step": 1453 }, { "epoch": 0.9969146383270483, "grad_norm": 0.2067383379161354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146594362.75409836, "logits/rejected": 71866581.97014925, "logps/chosen": -253.63934426229508, "logps/rejected": -440.35820895522386, "loss": 0.1518, "rewards/chosen": 1.5922131147540983, "rewards/margins": 10.965347443112307, "rewards/rejected": -9.373134328358208, "step": 1454 }, { "epoch": 0.9976002742543709, "grad_norm": 0.17100663135537653, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85876597.15254237, "logits/rejected": 107410654.60869566, "logps/chosen": -189.01694915254237, "logps/rejected": -387.71014492753625, "loss": 0.1436, "rewards/chosen": 1.507415254237288, "rewards/margins": 10.681328297715549, "rewards/rejected": -9.173913043478262, "step": 1455 }, { "epoch": 0.9982859101816935, "grad_norm": 0.24206982122824752, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93872518.09523809, "logits/rejected": 97888602.58461538, "logps/chosen": -241.52380952380952, "logps/rejected": -400.73846153846154, "loss": 0.1527, "rewards/chosen": 1.8373015873015872, "rewards/margins": 9.837301587301587, "rewards/rejected": -8.0, "step": 1456 }, { "epoch": 0.9989715461090161, "grad_norm": 0.22305042892518856, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109890764.8, "logits/rejected": 97264463.44827586, "logps/chosen": -279.54285714285714, "logps/rejected": -365.7931034482759, "loss": 0.1738, "rewards/chosen": 2.3392857142857144, "rewards/margins": 9.278940886699507, "rewards/rejected": -6.939655172413793, "step": 1457 }, { "epoch": 0.9996571820363387, "grad_norm": 0.20774413369881867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92768135.52941176, "logits/rejected": 145122918.4, "logps/chosen": -225.88235294117646, "logps/rejected": -427.73333333333335, "loss": 0.1707, "rewards/chosen": 1.6121323529411764, "rewards/margins": 10.57046568627451, "rewards/rejected": -8.958333333333334, "step": 1458 }, { "epoch": 1.0006856359273226, "grad_norm": 0.176493370421652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59244544.0, "logits/rejected": 106637419.78947368, "logps/chosen": -192.28571428571428, "logps/rejected": -357.05263157894734, "loss": 0.1844, "rewards/chosen": 2.1101190476190474, "rewards/margins": 9.577224310776941, "rewards/rejected": -7.467105263157895, "step": 1459 } ], "logging_steps": 1, "max_steps": 1459, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }