{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.75830078125, "eval_steps": 500, "global_step": 3600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 423.6328125, "epoch": 0.00048828125, "grad_norm": 1.6967331082339958, "kl": 0.0, "learning_rate": 9.998779296875e-07, "loss": -0.0, "reward": 1.3786234855651855, "reward_std": 0.4677655100822449, "rewards/format_reward": 0.8671875, "rewards/ocr_reward": 0.5114360153675079, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 312.5859375, "epoch": 0.0009765625, "grad_norm": 3.2468303824500158, "kl": 0.0004444122314453125, "learning_rate": 9.99755859375e-07, "loss": 0.0, "reward": 1.34132719039917, "reward_std": 0.22886180132627487, "rewards/format_reward": 0.8671875, "rewards/ocr_reward": 0.4741397053003311, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 351.9609375, "epoch": 0.00146484375, "grad_norm": 4.297129503903299, "kl": 0.0007305145263671875, "learning_rate": 9.996337890625e-07, "loss": 0.0, "reward": 1.3343781232833862, "reward_std": 0.3735136389732361, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.44375310838222504, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 470.828125, "epoch": 0.001953125, "grad_norm": 3.657808420164072, "kl": 0.00101470947265625, "learning_rate": 9.995117187499999e-07, "loss": 0.0, "reward": 1.2803430557250977, "reward_std": 0.3147875517606735, "rewards/format_reward": 0.828125, "rewards/ocr_reward": 0.4522180110216141, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 302.578125, "epoch": 0.00244140625, "grad_norm": 9.54822062112943, "kl": 0.001285552978515625, "learning_rate": 9.993896484375e-07, "loss": 0.0001, "reward": 1.4602121710777283, "reward_std": 0.26758695393800735, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.49927467107772827, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 428.953125, "epoch": 0.0029296875, "grad_norm": 3.269030427449002, "kl": 0.001796722412109375, "learning_rate": 9.992675781249999e-07, "loss": 0.0001, "reward": 1.3741803765296936, "reward_std": 0.25756245851516724, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.4523053914308548, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 280.5234375, "epoch": 0.00341796875, "grad_norm": 3.1536786445417637, "kl": 0.00457763671875, "learning_rate": 9.991455078125e-07, "loss": 0.0002, "reward": 1.4508002400398254, "reward_std": 0.21975237131118774, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.49767518043518066, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 385.3515625, "epoch": 0.00390625, "grad_norm": 3.829024827560807, "kl": 0.00382232666015625, "learning_rate": 9.990234375e-07, "loss": 0.0002, "reward": 1.437036395072937, "reward_std": 0.16978412866592407, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.4604738652706146, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 315.71875, "epoch": 0.00439453125, "grad_norm": 3.0867259887356244, "kl": 0.00495147705078125, "learning_rate": 9.989013671875e-07, "loss": 0.0002, "reward": 1.474764347076416, "reward_std": 0.22620604932308197, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.498201847076416, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 287.453125, "epoch": 0.0048828125, "grad_norm": 3.2629945996638403, "kl": 0.0056915283203125, "learning_rate": 9.98779296875e-07, "loss": 0.0002, "reward": 1.5907155871391296, "reward_std": 0.14950328320264816, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.590715616941452, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 349.390625, "epoch": 0.00537109375, "grad_norm": 1.6117640951539305, "kl": 0.0078887939453125, "learning_rate": 9.986572265624999e-07, "loss": 0.0003, "reward": 1.427464485168457, "reward_std": 0.19085168838500977, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.5134019106626511, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 343.671875, "epoch": 0.005859375, "grad_norm": 9.963892714501458, "kl": 0.006866455078125, "learning_rate": 9.9853515625e-07, "loss": 0.0003, "reward": 1.506593108177185, "reward_std": 0.21911517158150673, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5378430485725403, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 432.1484375, "epoch": 0.00634765625, "grad_norm": 1.198296114175371, "kl": 0.00494384765625, "learning_rate": 9.984130859374999e-07, "loss": 0.0002, "reward": 1.4732499718666077, "reward_std": 0.16561511158943176, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.5513749718666077, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 344.5234375, "epoch": 0.0068359375, "grad_norm": 2.2044476834833233, "kl": 0.00946044921875, "learning_rate": 9.98291015625e-07, "loss": 0.0004, "reward": 1.3112062215805054, "reward_std": 0.2594592794775963, "rewards/format_reward": 0.8828125, "rewards/ocr_reward": 0.4283936768770218, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 312.5078125, "epoch": 0.00732421875, "grad_norm": 6.042854996633053, "kl": 0.0079193115234375, "learning_rate": 9.981689453125e-07, "loss": 0.0003, "reward": 1.4512476921081543, "reward_std": 0.15800564736127853, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.5606226921081543, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 365.2734375, "epoch": 0.0078125, "grad_norm": 5.388723086801915, "kl": 0.008453369140625, "learning_rate": 9.98046875e-07, "loss": 0.0003, "reward": 1.4160526990890503, "reward_std": 0.19370869547128677, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.5254276692867279, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 293.1796875, "epoch": 0.00830078125, "grad_norm": 8.561270421888638, "kl": 0.008331298828125, "learning_rate": 9.979248046875e-07, "loss": 0.0003, "reward": 1.5414886474609375, "reward_std": 0.24305763095617294, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5571136474609375, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 306.53125, "epoch": 0.0087890625, "grad_norm": 2.3271079058180053, "kl": 0.0077972412109375, "learning_rate": 9.978027343749999e-07, "loss": 0.0003, "reward": 1.4583409428596497, "reward_std": 0.23799628019332886, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.48959091305732727, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 267.6953125, "epoch": 0.00927734375, "grad_norm": 5.347116097400923, "kl": 0.014739990234375, "learning_rate": 9.976806640625e-07, "loss": 0.0006, "reward": 1.4719247817993164, "reward_std": 0.24416528642177582, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.5422372817993164, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 256.8203125, "epoch": 0.009765625, "grad_norm": 2.1593789937228456, "kl": 0.0086669921875, "learning_rate": 9.9755859375e-07, "loss": 0.0003, "reward": 1.5712983012199402, "reward_std": 0.20670025050640106, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5791108012199402, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 265.53125, "epoch": 0.01025390625, "grad_norm": 5.174610588681323, "kl": 0.010955810546875, "learning_rate": 9.974365234375e-07, "loss": 0.0004, "reward": 1.6467618942260742, "reward_std": 0.17008116841316223, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.670199453830719, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 326.9921875, "epoch": 0.0107421875, "grad_norm": 2.2196458415428073, "kl": 0.0082244873046875, "learning_rate": 9.97314453125e-07, "loss": 0.0003, "reward": 1.5177651643753052, "reward_std": 0.183644600212574, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.5802651941776276, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 228.9140625, "epoch": 0.01123046875, "grad_norm": 2.1272235668522725, "kl": 0.009613037109375, "learning_rate": 9.971923828125e-07, "loss": 0.0004, "reward": 1.6449316143989563, "reward_std": 0.11167065799236298, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6449315845966339, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 342.109375, "epoch": 0.01171875, "grad_norm": 2.695370319521806, "kl": 0.0121002197265625, "learning_rate": 9.970703125e-07, "loss": 0.0005, "reward": 1.7102810740470886, "reward_std": 0.19407786428928375, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7180935442447662, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 356.171875, "epoch": 0.01220703125, "grad_norm": 2.4674039388994022, "kl": 0.010650634765625, "learning_rate": 9.969482421874999e-07, "loss": 0.0004, "reward": 1.4299457669258118, "reward_std": 0.20515850186347961, "rewards/format_reward": 0.8359375, "rewards/ocr_reward": 0.5940082669258118, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 305.2265625, "epoch": 0.0126953125, "grad_norm": 1.7112292524853188, "kl": 0.013671875, "learning_rate": 9.96826171875e-07, "loss": 0.0005, "reward": 1.5269352197647095, "reward_std": 0.12535615265369415, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5425602197647095, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 376.71875, "epoch": 0.01318359375, "grad_norm": 4.748056883738088, "kl": 0.0106964111328125, "learning_rate": 9.967041015625e-07, "loss": 0.0004, "reward": 1.4135064482688904, "reward_std": 0.3039677292108536, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.5228813886642456, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 361.1953125, "epoch": 0.013671875, "grad_norm": 2.4024436743008613, "kl": 0.008575439453125, "learning_rate": 9.9658203125e-07, "loss": 0.0003, "reward": 1.4704246520996094, "reward_std": 0.14263245463371277, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5251121670007706, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 294.078125, "epoch": 0.01416015625, "grad_norm": 2.4271553949012716, "kl": 0.0111083984375, "learning_rate": 9.964599609375e-07, "loss": 0.0004, "reward": 1.679746925830841, "reward_std": 0.17487338185310364, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6953718960285187, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 343.3671875, "epoch": 0.0146484375, "grad_norm": 1.7256261444170102, "kl": 0.01251220703125, "learning_rate": 9.963378906249999e-07, "loss": 0.0005, "reward": 1.3718626499176025, "reward_std": 0.15719684958457947, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.44998762011528015, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 307.625, "epoch": 0.01513671875, "grad_norm": 3.1473342645539613, "kl": 0.015167236328125, "learning_rate": 9.962158203125e-07, "loss": 0.0006, "reward": 1.5455162525177002, "reward_std": 0.09274030476808548, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6080162525177002, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 358.2265625, "epoch": 0.015625, "grad_norm": 2.719177732364667, "kl": 0.01239013671875, "learning_rate": 9.960937499999999e-07, "loss": 0.0005, "reward": 1.3972212672233582, "reward_std": 0.2669922858476639, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.47534629702568054, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 303.8203125, "epoch": 0.01611328125, "grad_norm": 3.6252731496314583, "kl": 0.014862060546875, "learning_rate": 9.959716796875e-07, "loss": 0.0006, "reward": 1.3229502439498901, "reward_std": 0.20802345871925354, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.41670016944408417, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 262.5390625, "epoch": 0.0166015625, "grad_norm": 5.151875445266958, "kl": 0.017364501953125, "learning_rate": 9.95849609375e-07, "loss": 0.0007, "reward": 1.5725292563438416, "reward_std": 0.18037345260381699, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5803417265415192, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 368.4296875, "epoch": 0.01708984375, "grad_norm": 1.6068668302465103, "kl": 0.01483154296875, "learning_rate": 9.957275390625e-07, "loss": 0.0006, "reward": 1.6186823844909668, "reward_std": 0.20612449198961258, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6499324142932892, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 294.3828125, "epoch": 0.017578125, "grad_norm": 5.289657901378204, "kl": 0.07623291015625, "learning_rate": 9.9560546875e-07, "loss": 0.0031, "reward": 1.7034948468208313, "reward_std": 0.10497300326824188, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7034948468208313, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 359.1875, "epoch": 0.01806640625, "grad_norm": 2.217914980304441, "kl": 0.008026123046875, "learning_rate": 9.954833984374999e-07, "loss": 0.0003, "reward": 1.5534625053405762, "reward_std": 0.15290548652410507, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6237750053405762, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 307.5234375, "epoch": 0.0185546875, "grad_norm": 2.4752008410956976, "kl": 0.01519775390625, "learning_rate": 9.95361328125e-07, "loss": 0.0006, "reward": 1.5298476219177246, "reward_std": 0.11099112778902054, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5298476219177246, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 416.5078125, "epoch": 0.01904296875, "grad_norm": 4.386993810362648, "kl": 0.017669677734375, "learning_rate": 9.952392578124999e-07, "loss": 0.0007, "reward": 1.3395265936851501, "reward_std": 0.2638590559363365, "rewards/format_reward": 0.8671875, "rewards/ocr_reward": 0.47233910858631134, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 378.3203125, "epoch": 0.01953125, "grad_norm": 4.4842670885571865, "kl": 0.014923095703125, "learning_rate": 9.951171875e-07, "loss": 0.0006, "reward": 1.5370002388954163, "reward_std": 0.2090120166540146, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.591687798500061, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 292.3359375, "epoch": 0.02001953125, "grad_norm": 8.295700223804772, "kl": 0.0208740234375, "learning_rate": 9.949951171875e-07, "loss": 0.0008, "reward": 1.5621129274368286, "reward_std": 0.14411582052707672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.562112957239151, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 335.6953125, "epoch": 0.0205078125, "grad_norm": 5.14140191685903, "kl": 0.019622802734375, "learning_rate": 9.94873046875e-07, "loss": 0.0008, "reward": 1.5798521041870117, "reward_std": 0.27509623765945435, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6111020445823669, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 319.2734375, "epoch": 0.02099609375, "grad_norm": 6.525096801696474, "kl": 0.03765869140625, "learning_rate": 9.947509765625e-07, "loss": 0.0015, "reward": 1.5670145750045776, "reward_std": 0.07265551388263702, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6295144557952881, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 305.953125, "epoch": 0.021484375, "grad_norm": 2.1028130721169855, "kl": 0.04144287109375, "learning_rate": 9.946289062499999e-07, "loss": 0.0017, "reward": 1.4101728200912476, "reward_std": 0.20591440051794052, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.48829779028892517, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 474.484375, "epoch": 0.02197265625, "grad_norm": 3.8367388215242437, "kl": 0.0299072265625, "learning_rate": 9.945068359375e-07, "loss": 0.0012, "reward": 1.4227579236030579, "reward_std": 0.262872114777565, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.5243203639984131, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 488.453125, "epoch": 0.0224609375, "grad_norm": 7.321597730270083, "kl": 0.03070068359375, "learning_rate": 9.94384765625e-07, "loss": 0.0012, "reward": 1.377393662929535, "reward_std": 0.22495906800031662, "rewards/format_reward": 0.828125, "rewards/ocr_reward": 0.5492686927318573, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 304.2578125, "epoch": 0.02294921875, "grad_norm": 3.0974827964713625, "kl": 0.0308837890625, "learning_rate": 9.942626953125e-07, "loss": 0.0012, "reward": 1.6084083914756775, "reward_std": 0.09188483282923698, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6084084212779999, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 307.6796875, "epoch": 0.0234375, "grad_norm": 2.5869776097386885, "kl": 0.02886962890625, "learning_rate": 9.94140625e-07, "loss": 0.0012, "reward": 1.6284254789352417, "reward_std": 0.10064487159252167, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6284254491329193, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 372.3359375, "epoch": 0.02392578125, "grad_norm": 2.157246364997451, "kl": 0.03350830078125, "learning_rate": 9.940185546875e-07, "loss": 0.0013, "reward": 1.672927439212799, "reward_std": 0.2006322741508484, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7198024392127991, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 357.8359375, "epoch": 0.0244140625, "grad_norm": 3.073205929906485, "kl": 0.0428466796875, "learning_rate": 9.93896484375e-07, "loss": 0.0017, "reward": 1.679724395275116, "reward_std": 0.1169515885412693, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7109744250774384, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 370.2734375, "epoch": 0.02490234375, "grad_norm": 1.2830008863309907, "kl": 0.027587890625, "learning_rate": 9.937744140624999e-07, "loss": 0.0011, "reward": 1.4477837085723877, "reward_std": 0.17552587389945984, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.5337212085723877, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 329.7265625, "epoch": 0.025390625, "grad_norm": 2.6436926651819443, "kl": 0.03045654296875, "learning_rate": 9.9365234375e-07, "loss": 0.0012, "reward": 1.6024810075759888, "reward_std": 0.1249840036034584, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6102935671806335, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 303.2109375, "epoch": 0.02587890625, "grad_norm": 2.4855128606318893, "kl": 0.02911376953125, "learning_rate": 9.935302734375e-07, "loss": 0.0012, "reward": 1.487706184387207, "reward_std": 0.1580093577504158, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.542393684387207, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 454.6171875, "epoch": 0.0263671875, "grad_norm": 3.8492649462150337, "kl": 0.02069091796875, "learning_rate": 9.93408203125e-07, "loss": 0.0008, "reward": 1.4205285906791687, "reward_std": 0.3576083779335022, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.5220911204814911, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 345.2109375, "epoch": 0.02685546875, "grad_norm": 3.3352616770122663, "kl": 0.02423095703125, "learning_rate": 9.932861328125e-07, "loss": 0.001, "reward": 1.4812852144241333, "reward_std": 0.21911777555942535, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.5672226548194885, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 434.9453125, "epoch": 0.02734375, "grad_norm": 2.599784945706524, "kl": 0.0146484375, "learning_rate": 9.931640625e-07, "loss": 0.0006, "reward": 1.5850829482078552, "reward_std": 0.17178751900792122, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6085204482078552, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 465.875, "epoch": 0.02783203125, "grad_norm": 4.155986577192788, "kl": 0.01739501953125, "learning_rate": 9.930419921875e-07, "loss": 0.0007, "reward": 1.448303461074829, "reward_std": 0.38380755484104156, "rewards/format_reward": 0.875, "rewards/ocr_reward": 0.5733034014701843, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 384.765625, "epoch": 0.0283203125, "grad_norm": 1.736596362026434, "kl": 0.0234375, "learning_rate": 9.929199218749999e-07, "loss": 0.0009, "reward": 1.5162723660469055, "reward_std": 0.2991267442703247, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.6022098660469055, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 365.3203125, "epoch": 0.02880859375, "grad_norm": 4.494090341930586, "kl": 0.03082275390625, "learning_rate": 9.927978515625e-07, "loss": 0.0012, "reward": 1.378541350364685, "reward_std": 0.35002946853637695, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.46447885036468506, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 374.2109375, "epoch": 0.029296875, "grad_norm": 6.756047598527613, "kl": 0.02886962890625, "learning_rate": 9.9267578125e-07, "loss": 0.0012, "reward": 1.3606464862823486, "reward_std": 0.36894528567790985, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.47002144157886505, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 251.421875, "epoch": 0.02978515625, "grad_norm": 4.731387630408789, "kl": 0.028076171875, "learning_rate": 9.925537109375e-07, "loss": 0.0011, "reward": 1.6455896496772766, "reward_std": 0.27443696558475494, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6924646198749542, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 330.0, "epoch": 0.0302734375, "grad_norm": 1.9490370706301865, "kl": 0.0244140625, "learning_rate": 9.92431640625e-07, "loss": 0.001, "reward": 1.5852088928222656, "reward_std": 0.3096665292978287, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6555215120315552, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 430.6875, "epoch": 0.03076171875, "grad_norm": 1.9868062360035326, "kl": 0.01654052734375, "learning_rate": 9.923095703124999e-07, "loss": 0.0007, "reward": 1.5640851855278015, "reward_std": 0.33458730578422546, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6187726855278015, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 430.8046875, "epoch": 0.03125, "grad_norm": 3.1565682997286975, "kl": 0.014373779296875, "learning_rate": 9.921875e-07, "loss": 0.0006, "reward": 1.451416552066803, "reward_std": 0.22569319605827332, "rewards/format_reward": 0.875, "rewards/ocr_reward": 0.576416552066803, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 454.4140625, "epoch": 0.03173828125, "grad_norm": 4.987882156519405, "kl": 0.0198974609375, "learning_rate": 9.920654296874999e-07, "loss": 0.0008, "reward": 1.3604365587234497, "reward_std": 0.3875332325696945, "rewards/format_reward": 0.8359375, "rewards/ocr_reward": 0.5244990885257721, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 441.3359375, "epoch": 0.0322265625, "grad_norm": 2.1756835153062988, "kl": 0.01727294921875, "learning_rate": 9.91943359375e-07, "loss": 0.0007, "reward": 1.476547658443451, "reward_std": 0.2438819855451584, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.5781101584434509, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 311.875, "epoch": 0.03271484375, "grad_norm": 2.9742812230395614, "kl": 0.01806640625, "learning_rate": 9.918212890625e-07, "loss": 0.0007, "reward": 1.575055181980133, "reward_std": 0.09458094835281372, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5750551819801331, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 339.28125, "epoch": 0.033203125, "grad_norm": 5.874510923979259, "kl": 0.01611328125, "learning_rate": 9.9169921875e-07, "loss": 0.0006, "reward": 1.5294025540351868, "reward_std": 0.14596965909004211, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.552839994430542, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 288.8203125, "epoch": 0.03369140625, "grad_norm": 9.28303459781024, "kl": 0.021240234375, "learning_rate": 9.915771484375e-07, "loss": 0.0008, "reward": 1.534590721130371, "reward_std": 0.15341224521398544, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5736532807350159, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 241.8828125, "epoch": 0.0341796875, "grad_norm": 10.443387554099303, "kl": 0.02099609375, "learning_rate": 9.914550781249999e-07, "loss": 0.0008, "reward": 1.741838276386261, "reward_std": 0.12638744711875916, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7652758061885834, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 222.9140625, "epoch": 0.03466796875, "grad_norm": 6.002627107380703, "kl": 0.02801513671875, "learning_rate": 9.913330078125e-07, "loss": 0.0011, "reward": 1.5784024596214294, "reward_std": 0.19862286746501923, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6018398702144623, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 269.9921875, "epoch": 0.03515625, "grad_norm": 3.901896848624282, "kl": 0.013671875, "learning_rate": 9.912109375e-07, "loss": 0.0005, "reward": 1.4875227212905884, "reward_std": 0.12161608785390854, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.4953352212905884, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 263.0390625, "epoch": 0.03564453125, "grad_norm": 3.5931169826306353, "kl": 0.02130126953125, "learning_rate": 9.910888671875e-07, "loss": 0.0009, "reward": 1.7116557955741882, "reward_std": 0.11958565562963486, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.711655855178833, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 330.4609375, "epoch": 0.0361328125, "grad_norm": 2.716899809253411, "kl": 0.017120361328125, "learning_rate": 9.90966796875e-07, "loss": 0.0007, "reward": 1.5722922682762146, "reward_std": 0.13994912058115005, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.6816672682762146, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 441.40625, "epoch": 0.03662109375, "grad_norm": 3.1440498309793634, "kl": 0.0177001953125, "learning_rate": 9.908447265625e-07, "loss": 0.0007, "reward": 1.3587397933006287, "reward_std": 0.1976253017783165, "rewards/format_reward": 0.8359375, "rewards/ocr_reward": 0.5228022933006287, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 415.5703125, "epoch": 0.037109375, "grad_norm": 4.514322351312445, "kl": 0.010009765625, "learning_rate": 9.9072265625e-07, "loss": 0.0004, "reward": 1.4641498923301697, "reward_std": 0.22810623794794083, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.5344623029232025, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 293.765625, "epoch": 0.03759765625, "grad_norm": 1.8710146074895642, "kl": 0.0203857421875, "learning_rate": 9.906005859374999e-07, "loss": 0.0008, "reward": 1.6418211460113525, "reward_std": 0.13721346855163574, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6574461758136749, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 372.859375, "epoch": 0.0380859375, "grad_norm": 2.1667842777691373, "kl": 0.018096923828125, "learning_rate": 9.90478515625e-07, "loss": 0.0007, "reward": 1.5189919471740723, "reward_std": 0.10774445161223412, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.5971169471740723, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 306.0703125, "epoch": 0.03857421875, "grad_norm": 4.797612672867945, "kl": 0.01641845703125, "learning_rate": 9.903564453125e-07, "loss": 0.0007, "reward": 1.6151621341705322, "reward_std": 0.059841278940439224, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6151621639728546, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 284.375, "epoch": 0.0390625, "grad_norm": 3.385092526026789, "kl": 0.01837158203125, "learning_rate": 9.90234375e-07, "loss": 0.0007, "reward": 1.7148075699806213, "reward_std": 0.13570959120988846, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7226200103759766, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 446.6171875, "epoch": 0.03955078125, "grad_norm": 2.6814045791672685, "kl": 0.014251708984375, "learning_rate": 9.901123046875e-07, "loss": 0.0006, "reward": 1.515661358833313, "reward_std": 0.15792688727378845, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5703488886356354, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 296.6953125, "epoch": 0.0400390625, "grad_norm": 6.013775861153405, "kl": 0.02032470703125, "learning_rate": 9.89990234375e-07, "loss": 0.0008, "reward": 1.5584400296211243, "reward_std": 0.12772930040955544, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5818775296211243, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 371.4140625, "epoch": 0.04052734375, "grad_norm": 1.72500127507919, "kl": 0.0257568359375, "learning_rate": 9.898681640625e-07, "loss": 0.001, "reward": 1.4484447836875916, "reward_std": 0.1641346886754036, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.49531984329223633, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 318.09375, "epoch": 0.041015625, "grad_norm": 2.2558835120683733, "kl": 0.0225830078125, "learning_rate": 9.897460937499999e-07, "loss": 0.0009, "reward": 1.5773499011993408, "reward_std": 0.13273335248231888, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6007874011993408, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 401.1953125, "epoch": 0.04150390625, "grad_norm": 2.729303138391425, "kl": 0.011932373046875, "learning_rate": 9.896240234375e-07, "loss": 0.0005, "reward": 1.5902302265167236, "reward_std": 0.20242100954055786, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6214802265167236, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 326.453125, "epoch": 0.0419921875, "grad_norm": 1.6659116590542917, "kl": 0.013641357421875, "learning_rate": 9.89501953125e-07, "loss": 0.0005, "reward": 1.5991840958595276, "reward_std": 0.08300643041729927, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5991840660572052, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 171.265625, "epoch": 0.04248046875, "grad_norm": 6.5083912720618455, "kl": 0.02520751953125, "learning_rate": 9.893798828125e-07, "loss": 0.001, "reward": 1.5490674376487732, "reward_std": 0.1682056337594986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5490674078464508, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 306.3984375, "epoch": 0.04296875, "grad_norm": 1.855137877800116, "kl": 0.01483154296875, "learning_rate": 9.892578125e-07, "loss": 0.0006, "reward": 1.6930819749832153, "reward_std": 0.08091134577989578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6930819451808929, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 268.734375, "epoch": 0.04345703125, "grad_norm": 3.0628004168463323, "kl": 0.0181884765625, "learning_rate": 9.891357421874999e-07, "loss": 0.0007, "reward": 1.5072910785675049, "reward_std": 0.10918539017438889, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5072910487651825, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 318.328125, "epoch": 0.0439453125, "grad_norm": 2.4591507009268003, "kl": 0.0205078125, "learning_rate": 9.89013671875e-07, "loss": 0.0008, "reward": 1.730940043926239, "reward_std": 0.10830854251980782, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7309400737285614, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 289.8984375, "epoch": 0.04443359375, "grad_norm": 1.387374494016842, "kl": 0.015838623046875, "learning_rate": 9.888916015624999e-07, "loss": 0.0006, "reward": 1.575575053691864, "reward_std": 0.13328294083476067, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.583387479186058, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 351.703125, "epoch": 0.044921875, "grad_norm": 3.0271895365123136, "kl": 0.0208740234375, "learning_rate": 9.8876953125e-07, "loss": 0.0008, "reward": 1.5377304553985596, "reward_std": 0.1667354628443718, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6080429553985596, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 266.578125, "epoch": 0.04541015625, "grad_norm": 2.6921828266080015, "kl": 0.017822265625, "learning_rate": 9.886474609375e-07, "loss": 0.0007, "reward": 1.6924698948860168, "reward_std": 0.1142515130341053, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6924698948860168, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 223.109375, "epoch": 0.0458984375, "grad_norm": 3.472336958554876, "kl": 0.01885986328125, "learning_rate": 9.88525390625e-07, "loss": 0.0008, "reward": 1.5990204811096191, "reward_std": 0.10428202897310257, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6146455407142639, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 372.703125, "epoch": 0.04638671875, "grad_norm": 2.6702059682407917, "kl": 0.02020263671875, "learning_rate": 9.884033203125e-07, "loss": 0.0008, "reward": 1.6923083066940308, "reward_std": 0.181168332695961, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.715745747089386, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 417.0546875, "epoch": 0.046875, "grad_norm": 3.236307882884475, "kl": 0.01593017578125, "learning_rate": 9.882812499999999e-07, "loss": 0.0006, "reward": 1.4725679755210876, "reward_std": 0.18862508982419968, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.49600549042224884, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 287.5078125, "epoch": 0.04736328125, "grad_norm": 3.1662840188332004, "kl": 0.020263671875, "learning_rate": 9.881591796875e-07, "loss": 0.0008, "reward": 1.525817096233368, "reward_std": 0.11687836796045303, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5805045962333679, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 285.625, "epoch": 0.0478515625, "grad_norm": 2.623130324904824, "kl": 0.01812744140625, "learning_rate": 9.88037109375e-07, "loss": 0.0007, "reward": 1.7077008485794067, "reward_std": 0.15604694932699203, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7077008485794067, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 288.859375, "epoch": 0.04833984375, "grad_norm": 1.591617609181676, "kl": 0.019775390625, "learning_rate": 9.879150390625e-07, "loss": 0.0008, "reward": 1.5951241254806519, "reward_std": 0.14091318100690842, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6029366254806519, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 410.453125, "epoch": 0.048828125, "grad_norm": 4.771714723016947, "kl": 0.01776123046875, "learning_rate": 9.8779296875e-07, "loss": 0.0007, "reward": 1.5604987144470215, "reward_std": 0.19173409044742584, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6229987442493439, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 333.0546875, "epoch": 0.04931640625, "grad_norm": 9.104596570687864, "kl": 0.013702392578125, "learning_rate": 9.876708984375e-07, "loss": 0.0005, "reward": 1.6190659403800964, "reward_std": 0.12071932479739189, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6190659999847412, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 340.5390625, "epoch": 0.0498046875, "grad_norm": 2.1594646925560386, "kl": 0.01641845703125, "learning_rate": 9.87548828125e-07, "loss": 0.0007, "reward": 1.651434302330017, "reward_std": 0.19432562589645386, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6592467725276947, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 337.6171875, "epoch": 0.05029296875, "grad_norm": 3.3906829108536165, "kl": 0.01910400390625, "learning_rate": 9.874267578124999e-07, "loss": 0.0008, "reward": 1.711862325668335, "reward_std": 0.17600611969828606, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7431123554706573, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 379.109375, "epoch": 0.05078125, "grad_norm": 5.335142373058523, "kl": 0.012725830078125, "learning_rate": 9.873046875e-07, "loss": 0.0005, "reward": 1.5398271083831787, "reward_std": 0.13578901067376137, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5867020785808563, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 345.7421875, "epoch": 0.05126953125, "grad_norm": 2.246739195857841, "kl": 0.016357421875, "learning_rate": 9.871826171875e-07, "loss": 0.0007, "reward": 1.7209094762802124, "reward_std": 0.09800073876976967, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7287219762802124, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 214.34375, "epoch": 0.0517578125, "grad_norm": 3.430606877922797, "kl": 0.02001953125, "learning_rate": 9.87060546875e-07, "loss": 0.0008, "reward": 1.63734370470047, "reward_std": 0.12101611867547035, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6373437345027924, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 357.1171875, "epoch": 0.05224609375, "grad_norm": 6.621962333726358, "kl": 0.01654052734375, "learning_rate": 9.869384765625e-07, "loss": 0.0007, "reward": 1.5627512335777283, "reward_std": 0.20160631090402603, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.6408762633800507, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 439.3046875, "epoch": 0.052734375, "grad_norm": 3.714058389222369, "kl": 0.01226806640625, "learning_rate": 9.8681640625e-07, "loss": 0.0005, "reward": 1.6258893013000488, "reward_std": 0.2035977840423584, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.664951741695404, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 387.0546875, "epoch": 0.05322265625, "grad_norm": 2.225619976721532, "kl": 0.01641845703125, "learning_rate": 9.866943359375e-07, "loss": 0.0007, "reward": 1.5322623252868652, "reward_std": 0.12227768450975418, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.61819988489151, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 279.03125, "epoch": 0.0537109375, "grad_norm": 3.398354345358393, "kl": 0.01690673828125, "learning_rate": 9.865722656249999e-07, "loss": 0.0007, "reward": 1.6276288628578186, "reward_std": 0.07827305793762207, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6745038628578186, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 268.5625, "epoch": 0.05419921875, "grad_norm": 5.939409220904329, "kl": 0.021087646484375, "learning_rate": 9.864501953125e-07, "loss": 0.0008, "reward": 1.574878215789795, "reward_std": 0.08811075612902641, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6373782455921173, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 331.9765625, "epoch": 0.0546875, "grad_norm": 2.2502789319358554, "kl": 0.01654052734375, "learning_rate": 9.86328125e-07, "loss": 0.0007, "reward": 1.6835005283355713, "reward_std": 0.14054467901587486, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6913129687309265, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 319.515625, "epoch": 0.05517578125, "grad_norm": 1.0034153882792571, "kl": 0.0185546875, "learning_rate": 9.862060546875e-07, "loss": 0.0007, "reward": 1.8294273614883423, "reward_std": 0.11719358898699284, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8528648614883423, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 317.7890625, "epoch": 0.0556640625, "grad_norm": 16.17730319934069, "kl": 0.01983642578125, "learning_rate": 9.86083984375e-07, "loss": 0.0008, "reward": 1.5960276126861572, "reward_std": 0.14195309579372406, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6272775828838348, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 341.2578125, "epoch": 0.05615234375, "grad_norm": 2.541506995594959, "kl": 0.024169921875, "learning_rate": 9.859619140624999e-07, "loss": 0.001, "reward": 1.544093132019043, "reward_std": 0.206620991230011, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6065930724143982, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 300.28125, "epoch": 0.056640625, "grad_norm": 2.7963737233551176, "kl": 0.019287109375, "learning_rate": 9.8583984375e-07, "loss": 0.0008, "reward": 1.6467041969299316, "reward_std": 0.07854663208127022, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6545166969299316, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 301.8671875, "epoch": 0.05712890625, "grad_norm": 1.692239110975536, "kl": 0.02508544921875, "learning_rate": 9.857177734374999e-07, "loss": 0.001, "reward": 1.7825125455856323, "reward_std": 0.08674684725701809, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7825126051902771, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 262.9609375, "epoch": 0.0576171875, "grad_norm": 2.115673596589236, "kl": 0.03924560546875, "learning_rate": 9.85595703125e-07, "loss": 0.0016, "reward": 1.5380715131759644, "reward_std": 0.13539821282029152, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.6240090429782867, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 311.109375, "epoch": 0.05810546875, "grad_norm": 15.946926979376824, "kl": 0.0296630859375, "learning_rate": 9.854736328125e-07, "loss": 0.0012, "reward": 1.5573410987854004, "reward_std": 0.11809306219220161, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6198410987854004, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 247.3203125, "epoch": 0.05859375, "grad_norm": 4.256018644115217, "kl": 0.0277099609375, "learning_rate": 9.853515625e-07, "loss": 0.0011, "reward": 1.7416256666183472, "reward_std": 0.11583732068538666, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7494381666183472, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 399.0703125, "epoch": 0.05908203125, "grad_norm": 2.603585855956455, "kl": 0.020050048828125, "learning_rate": 9.852294921875e-07, "loss": 0.0008, "reward": 1.4661349058151245, "reward_std": 0.1207830049097538, "rewards/format_reward": 0.859375, "rewards/ocr_reward": 0.6067598760128021, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 284.03125, "epoch": 0.0595703125, "grad_norm": 4.267397613222153, "kl": 0.0263671875, "learning_rate": 9.851074218749999e-07, "loss": 0.0011, "reward": 1.7082802057266235, "reward_std": 0.1457432433962822, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7395302057266235, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 290.5390625, "epoch": 0.06005859375, "grad_norm": 5.592358687960914, "kl": 0.02459716796875, "learning_rate": 9.849853515625e-07, "loss": 0.001, "reward": 1.653491497039795, "reward_std": 0.1756032481789589, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6613039970397949, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 311.34375, "epoch": 0.060546875, "grad_norm": 1.9342219325316152, "kl": 0.0211181640625, "learning_rate": 9.848632812499999e-07, "loss": 0.0008, "reward": 1.6969304084777832, "reward_std": 0.09714720770716667, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.704742968082428, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 342.8671875, "epoch": 0.06103515625, "grad_norm": 3.5352626846601334, "kl": 0.0206298828125, "learning_rate": 9.847412109375e-07, "loss": 0.0008, "reward": 1.5322385430335999, "reward_std": 0.17621152848005295, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.6103635132312775, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 475.5859375, "epoch": 0.0615234375, "grad_norm": 1.1126647607262938, "kl": 0.01446533203125, "learning_rate": 9.84619140625e-07, "loss": 0.0006, "reward": 1.5217909812927246, "reward_std": 0.14773621410131454, "rewards/format_reward": 0.859375, "rewards/ocr_reward": 0.6624160408973694, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 390.2265625, "epoch": 0.06201171875, "grad_norm": 16.170587561342735, "kl": 0.0191650390625, "learning_rate": 9.844970703125e-07, "loss": 0.0008, "reward": 1.5967344641685486, "reward_std": 0.14839724078774452, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.690484493970871, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 301.7265625, "epoch": 0.0625, "grad_norm": 1.7043882185778825, "kl": 0.01751708984375, "learning_rate": 9.84375e-07, "loss": 0.0007, "reward": 1.5539951920509338, "reward_std": 0.12984895333647728, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5696201622486115, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 306.984375, "epoch": 0.06298828125, "grad_norm": 1.3531844603958978, "kl": 0.02239990234375, "learning_rate": 9.842529296874999e-07, "loss": 0.0009, "reward": 1.5346065759658813, "reward_std": 0.08880486711859703, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5736691057682037, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 248.2890625, "epoch": 0.0634765625, "grad_norm": 4.715371554402326, "kl": 0.022705078125, "learning_rate": 9.84130859375e-07, "loss": 0.0009, "reward": 1.62563157081604, "reward_std": 0.12431228160858154, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6412566304206848, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 295.578125, "epoch": 0.06396484375, "grad_norm": 3.5849539827861907, "kl": 0.02325439453125, "learning_rate": 9.840087890625e-07, "loss": 0.0009, "reward": 1.7144591212272644, "reward_std": 0.13489311560988426, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.730084091424942, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 375.484375, "epoch": 0.064453125, "grad_norm": 1.4393273984880173, "kl": 0.0228271484375, "learning_rate": 9.8388671875e-07, "loss": 0.0009, "reward": 1.6248722076416016, "reward_std": 0.13557805679738522, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6561221778392792, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 362.5390625, "epoch": 0.06494140625, "grad_norm": 8.4663670507505, "kl": 0.01715087890625, "learning_rate": 9.837646484375e-07, "loss": 0.0007, "reward": 1.613499641418457, "reward_std": 0.18968282639980316, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6213121712207794, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 242.625, "epoch": 0.0654296875, "grad_norm": 5.360156728372636, "kl": 0.02471923828125, "learning_rate": 9.83642578125e-07, "loss": 0.001, "reward": 1.6982364058494568, "reward_std": 0.21003302931785583, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7216738760471344, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 255.7890625, "epoch": 0.06591796875, "grad_norm": 2.4693839555178103, "kl": 0.02178955078125, "learning_rate": 9.835205078125e-07, "loss": 0.0009, "reward": 1.6611779928207397, "reward_std": 0.15551955252885818, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6768029928207397, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 289.125, "epoch": 0.06640625, "grad_norm": 2.8329079084560735, "kl": 0.02130126953125, "learning_rate": 9.833984374999999e-07, "loss": 0.0009, "reward": 1.5032538771629333, "reward_std": 0.1515774130821228, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5579414367675781, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 355.4140625, "epoch": 0.06689453125, "grad_norm": 1.882217780602691, "kl": 0.019775390625, "learning_rate": 9.832763671875e-07, "loss": 0.0008, "reward": 1.484582245349884, "reward_std": 0.11968936026096344, "rewards/format_reward": 0.8828125, "rewards/ocr_reward": 0.6017696857452393, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 347.3046875, "epoch": 0.0673828125, "grad_norm": 3.046463172685156, "kl": 0.0228271484375, "learning_rate": 9.83154296875e-07, "loss": 0.0009, "reward": 1.5993627905845642, "reward_std": 0.20090486854314804, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6462377905845642, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 282.8984375, "epoch": 0.06787109375, "grad_norm": 3.0503455953592784, "kl": 0.02288818359375, "learning_rate": 9.830322265625e-07, "loss": 0.0009, "reward": 1.5906482934951782, "reward_std": 0.15873637050390244, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6140858232975006, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 286.5625, "epoch": 0.068359375, "grad_norm": 2.120223798059756, "kl": 0.02313232421875, "learning_rate": 9.8291015625e-07, "loss": 0.0009, "reward": 1.6196279525756836, "reward_std": 0.11341691762208939, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.619627982378006, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 268.2578125, "epoch": 0.06884765625, "grad_norm": 7.285340344072795, "kl": 0.022216796875, "learning_rate": 9.827880859374999e-07, "loss": 0.0009, "reward": 1.6983768343925476, "reward_std": 0.12035223841667175, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6983768343925476, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 398.0703125, "epoch": 0.0693359375, "grad_norm": 5.9889935265796685, "kl": 0.01953125, "learning_rate": 9.82666015625e-07, "loss": 0.0008, "reward": 1.4262371063232422, "reward_std": 0.17644815146923065, "rewards/format_reward": 0.890625, "rewards/ocr_reward": 0.5356121361255646, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 269.7890625, "epoch": 0.06982421875, "grad_norm": 3.990164163837389, "kl": 0.0240478515625, "learning_rate": 9.825439453124999e-07, "loss": 0.001, "reward": 1.5707527995109558, "reward_std": 0.11035867407917976, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5707527995109558, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 335.375, "epoch": 0.0703125, "grad_norm": 0.8573518157012104, "kl": 0.017333984375, "learning_rate": 9.82421875e-07, "loss": 0.0007, "reward": 1.71940678358078, "reward_std": 0.14653569110669196, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.7819067537784576, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 238.625, "epoch": 0.07080078125, "grad_norm": 3.6713857102420615, "kl": 0.02471923828125, "learning_rate": 9.822998046875e-07, "loss": 0.001, "reward": 1.5883715152740479, "reward_std": 0.04609652329236269, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5883715152740479, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 316.859375, "epoch": 0.0712890625, "grad_norm": 2.2594904451340994, "kl": 0.02471923828125, "learning_rate": 9.82177734375e-07, "loss": 0.001, "reward": 1.5935519933700562, "reward_std": 0.12184244394302368, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6013644337654114, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 317.078125, "epoch": 0.07177734375, "grad_norm": 2.5367376087547227, "kl": 0.020416259765625, "learning_rate": 9.820556640625e-07, "loss": 0.0008, "reward": 1.6828487515449524, "reward_std": 0.09689129143953323, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.68284872174263, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 336.0078125, "epoch": 0.072265625, "grad_norm": 24.91393756517066, "kl": 0.029541015625, "learning_rate": 9.819335937499999e-07, "loss": 0.0012, "reward": 1.576207160949707, "reward_std": 0.1735726036131382, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.623082160949707, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 330.96875, "epoch": 0.07275390625, "grad_norm": 1.7405264781938403, "kl": 0.0450439453125, "learning_rate": 9.818115234375e-07, "loss": 0.0018, "reward": 1.6206218600273132, "reward_std": 0.19472770392894745, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6674968600273132, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 221.7578125, "epoch": 0.0732421875, "grad_norm": 2.391591985194695, "kl": 0.036865234375, "learning_rate": 9.816894531249999e-07, "loss": 0.0015, "reward": 1.6796503067016602, "reward_std": 0.05935625545680523, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6796503067016602, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 289.5859375, "epoch": 0.07373046875, "grad_norm": 2.9623351499295425, "kl": 0.0338134765625, "learning_rate": 9.815673828125e-07, "loss": 0.0014, "reward": 1.6772570610046387, "reward_std": 0.14248831570148468, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7319445908069611, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 325.3125, "epoch": 0.07421875, "grad_norm": 3.546506424634404, "kl": 0.030517578125, "learning_rate": 9.814453125e-07, "loss": 0.0012, "reward": 1.5874695181846619, "reward_std": 0.13588757812976837, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5874694883823395, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 211.953125, "epoch": 0.07470703125, "grad_norm": 6.170642074931075, "kl": 0.0421142578125, "learning_rate": 9.813232421875e-07, "loss": 0.0017, "reward": 1.6709791421890259, "reward_std": 0.08850692212581635, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6709791421890259, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 366.4296875, "epoch": 0.0751953125, "grad_norm": 2.4910705552225747, "kl": 0.03271484375, "learning_rate": 9.81201171875e-07, "loss": 0.0013, "reward": 1.6164610385894775, "reward_std": 0.1261097490787506, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6633360981941223, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 373.140625, "epoch": 0.07568359375, "grad_norm": 1.337539431613464, "kl": 0.0283203125, "learning_rate": 9.810791015624999e-07, "loss": 0.0011, "reward": 1.5733261704444885, "reward_std": 0.23978520929813385, "rewards/format_reward": 0.875, "rewards/ocr_reward": 0.6983261406421661, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 349.8671875, "epoch": 0.076171875, "grad_norm": 3.2537345038786407, "kl": 0.0272216796875, "learning_rate": 9.8095703125e-07, "loss": 0.0011, "reward": 1.5053273439407349, "reward_std": 0.13704759627580643, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5365773737430573, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 391.21875, "epoch": 0.07666015625, "grad_norm": 14.554632484007437, "kl": 0.0223388671875, "learning_rate": 9.808349609375e-07, "loss": 0.0009, "reward": 1.6272760033607483, "reward_std": 0.14983859658241272, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6741509735584259, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 306.359375, "epoch": 0.0771484375, "grad_norm": 2.334403175128852, "kl": 0.0277099609375, "learning_rate": 9.80712890625e-07, "loss": 0.0011, "reward": 1.5797749757766724, "reward_std": 0.08415070176124573, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5797749757766724, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 279.1953125, "epoch": 0.07763671875, "grad_norm": 3.0249691058933306, "kl": 0.03240966796875, "learning_rate": 9.805908203125e-07, "loss": 0.0013, "reward": 1.5711604952812195, "reward_std": 0.08924713358283043, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5867854952812195, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 420.1328125, "epoch": 0.078125, "grad_norm": 6.8155628419050185, "kl": 0.02862548828125, "learning_rate": 9.8046875e-07, "loss": 0.0011, "reward": 1.635881781578064, "reward_std": 0.17367641627788544, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6827567219734192, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 335.8359375, "epoch": 0.07861328125, "grad_norm": 3.0299661554177377, "kl": 0.0274658203125, "learning_rate": 9.803466796875e-07, "loss": 0.0011, "reward": 1.6225927472114563, "reward_std": 0.15056072175502777, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6382177472114563, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 319.4296875, "epoch": 0.0791015625, "grad_norm": 1.7406985012161398, "kl": 0.03253173828125, "learning_rate": 9.802246093749999e-07, "loss": 0.0013, "reward": 1.6942219734191895, "reward_std": 0.0540752187371254, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6942219436168671, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 311.96875, "epoch": 0.07958984375, "grad_norm": 4.477055997348827, "kl": 0.031494140625, "learning_rate": 9.801025390625e-07, "loss": 0.0013, "reward": 1.5531994700431824, "reward_std": 0.16673196852207184, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5844494700431824, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 408.96875, "epoch": 0.080078125, "grad_norm": 1.86554014043665, "kl": 0.0247802734375, "learning_rate": 9.7998046875e-07, "loss": 0.001, "reward": 1.5419456362724304, "reward_std": 0.16365046054124832, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.643508106470108, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 349.3125, "epoch": 0.08056640625, "grad_norm": 6.446669102096267, "kl": 0.03466796875, "learning_rate": 9.798583984375e-07, "loss": 0.0014, "reward": 1.6253865957260132, "reward_std": 0.13813912868499756, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6722615659236908, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 293.359375, "epoch": 0.0810546875, "grad_norm": 2.3945943551584623, "kl": 0.0513916015625, "learning_rate": 9.79736328125e-07, "loss": 0.0021, "reward": 1.5900596380233765, "reward_std": 0.17274170368909836, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6213095486164093, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 324.1640625, "epoch": 0.08154296875, "grad_norm": 4.935892148171108, "kl": 0.0283203125, "learning_rate": 9.796142578125e-07, "loss": 0.0011, "reward": 1.5856729745864868, "reward_std": 0.19583696871995926, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.609110414981842, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 344.7109375, "epoch": 0.08203125, "grad_norm": 2.759482806908666, "kl": 0.0263671875, "learning_rate": 9.794921875e-07, "loss": 0.0011, "reward": 1.6779165267944336, "reward_std": 0.1504085585474968, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7326040267944336, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 296.921875, "epoch": 0.08251953125, "grad_norm": 2.0205610250383703, "kl": 0.03204345703125, "learning_rate": 9.793701171874999e-07, "loss": 0.0013, "reward": 1.6056262850761414, "reward_std": 0.12054416164755821, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6446887850761414, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 314.140625, "epoch": 0.0830078125, "grad_norm": 2.9051619908357025, "kl": 0.029052734375, "learning_rate": 9.79248046875e-07, "loss": 0.0012, "reward": 1.693081021308899, "reward_std": 0.17445684224367142, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7165184915065765, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 334.359375, "epoch": 0.08349609375, "grad_norm": 1.815375912478969, "kl": 0.02777099609375, "learning_rate": 9.791259765625e-07, "loss": 0.0011, "reward": 1.6882360577583313, "reward_std": 0.13848505914211273, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7351110577583313, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 241.953125, "epoch": 0.083984375, "grad_norm": 2.2996318418317734, "kl": 0.029541015625, "learning_rate": 9.7900390625e-07, "loss": 0.0012, "reward": 1.4833272099494934, "reward_std": 0.10671622306108475, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.483327180147171, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 329.0, "epoch": 0.08447265625, "grad_norm": 1.398704102180127, "kl": 0.0325927734375, "learning_rate": 9.788818359375e-07, "loss": 0.0013, "reward": 1.6148168444633484, "reward_std": 0.07767279259860516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6148169040679932, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 298.6484375, "epoch": 0.0849609375, "grad_norm": 3.4750762750830453, "kl": 0.0400390625, "learning_rate": 9.787597656249999e-07, "loss": 0.0016, "reward": 1.6466941833496094, "reward_std": 0.18221855908632278, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6779442429542542, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 278.7265625, "epoch": 0.08544921875, "grad_norm": 2.3793365576087435, "kl": 0.034912109375, "learning_rate": 9.786376953125e-07, "loss": 0.0014, "reward": 1.702051043510437, "reward_std": 0.17722390592098236, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7098636031150818, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 264.421875, "epoch": 0.0859375, "grad_norm": 5.929036755770266, "kl": 0.0335693359375, "learning_rate": 9.785156249999999e-07, "loss": 0.0013, "reward": 1.702830970287323, "reward_std": 0.10274038091301918, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.702830970287323, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 440.71875, "epoch": 0.08642578125, "grad_norm": 7.4527179001348856, "kl": 0.02398681640625, "learning_rate": 9.783935546875e-07, "loss": 0.001, "reward": 1.5519742965698242, "reward_std": 0.1975010707974434, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6144742965698242, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 248.8515625, "epoch": 0.0869140625, "grad_norm": 4.553092566985503, "kl": 0.03271484375, "learning_rate": 9.78271484375e-07, "loss": 0.0013, "reward": 1.7317935228347778, "reward_std": 0.1101585403084755, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7396060526371002, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 370.828125, "epoch": 0.08740234375, "grad_norm": 1.9568946330604422, "kl": 0.0372314453125, "learning_rate": 9.781494140625e-07, "loss": 0.0015, "reward": 1.4185363054275513, "reward_std": 0.2236497402191162, "rewards/format_reward": 0.796875, "rewards/ocr_reward": 0.6216612756252289, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 304.2421875, "epoch": 0.087890625, "grad_norm": 5.658211765236265, "kl": 0.0313720703125, "learning_rate": 9.7802734375e-07, "loss": 0.0013, "reward": 1.7431809902191162, "reward_std": 0.04815910384058952, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7431809306144714, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 373.2890625, "epoch": 0.08837890625, "grad_norm": 1.3083032198586613, "kl": 0.0234375, "learning_rate": 9.779052734374999e-07, "loss": 0.0009, "reward": 1.8097354173660278, "reward_std": 0.0659454632550478, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8097354471683502, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 322.796875, "epoch": 0.0888671875, "grad_norm": 4.109670831162618, "kl": 0.02972412109375, "learning_rate": 9.77783203125e-07, "loss": 0.0012, "reward": 1.6616966128349304, "reward_std": 0.11840381100773811, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6773216724395752, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 314.4140625, "epoch": 0.08935546875, "grad_norm": 3.291287735681801, "kl": 0.0384521484375, "learning_rate": 9.776611328125e-07, "loss": 0.0015, "reward": 1.6017380952835083, "reward_std": 0.18959469348192215, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6251756846904755, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 309.703125, "epoch": 0.08984375, "grad_norm": 1.3054821132450145, "kl": 0.02685546875, "learning_rate": 9.775390625e-07, "loss": 0.0011, "reward": 1.7018551230430603, "reward_std": 0.06901280581951141, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7018550932407379, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 286.2421875, "epoch": 0.09033203125, "grad_norm": 3.269405991674511, "kl": 0.0325927734375, "learning_rate": 9.774169921875e-07, "loss": 0.0013, "reward": 1.5782784819602966, "reward_std": 0.18268048018217087, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6329659819602966, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 242.5546875, "epoch": 0.0908203125, "grad_norm": 3.8655457222519, "kl": 0.0316162109375, "learning_rate": 9.77294921875e-07, "loss": 0.0013, "reward": 1.626326560974121, "reward_std": 0.10836686193943024, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6263265609741211, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 325.9921875, "epoch": 0.09130859375, "grad_norm": 1.5866438731376145, "kl": 0.032958984375, "learning_rate": 9.771728515625e-07, "loss": 0.0013, "reward": 1.719668209552765, "reward_std": 0.08064623922109604, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7274806201457977, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 323.21875, "epoch": 0.091796875, "grad_norm": 5.9177668434469775, "kl": 0.03240966796875, "learning_rate": 9.770507812499999e-07, "loss": 0.0013, "reward": 1.6977457404136658, "reward_std": 0.15341190993785858, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7211832702159882, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 299.1953125, "epoch": 0.09228515625, "grad_norm": 2.0723613037564426, "kl": 0.0316162109375, "learning_rate": 9.769287109375e-07, "loss": 0.0013, "reward": 1.6144706010818481, "reward_std": 0.15216557681560516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6144706010818481, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 320.046875, "epoch": 0.0927734375, "grad_norm": 8.245892693245946, "kl": 0.02520751953125, "learning_rate": 9.76806640625e-07, "loss": 0.001, "reward": 1.5022258758544922, "reward_std": 0.1600368544459343, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5491008907556534, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 269.828125, "epoch": 0.09326171875, "grad_norm": 3.030844823693362, "kl": 0.02996826171875, "learning_rate": 9.766845703125e-07, "loss": 0.0012, "reward": 1.6883333325386047, "reward_std": 0.22212432324886322, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7195833027362823, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 350.0625, "epoch": 0.09375, "grad_norm": 1.8859191428821602, "kl": 0.0260009765625, "learning_rate": 9.765625e-07, "loss": 0.001, "reward": 1.5929180979728699, "reward_std": 0.1562328040599823, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6319805383682251, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 306.875, "epoch": 0.09423828125, "grad_norm": 1.8480988076195144, "kl": 0.02642822265625, "learning_rate": 9.764404296875e-07, "loss": 0.0011, "reward": 1.6515385508537292, "reward_std": 0.13265355303883553, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6827885508537292, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 214.6015625, "epoch": 0.0947265625, "grad_norm": 1.9325979913798101, "kl": 0.03607177734375, "learning_rate": 9.76318359375e-07, "loss": 0.0014, "reward": 1.761966586112976, "reward_std": 0.06584762595593929, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7619665265083313, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 372.2109375, "epoch": 0.09521484375, "grad_norm": 1.9358707417121381, "kl": 0.03466796875, "learning_rate": 9.761962890624999e-07, "loss": 0.0014, "reward": 1.7281653881072998, "reward_std": 0.16422076523303986, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7672278881072998, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 301.328125, "epoch": 0.095703125, "grad_norm": 3.023440850843629, "kl": 0.024658203125, "learning_rate": 9.7607421875e-07, "loss": 0.001, "reward": 1.7193759679794312, "reward_std": 0.17258312553167343, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7428134679794312, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 301.890625, "epoch": 0.09619140625, "grad_norm": 2.595712818329617, "kl": 0.038818359375, "learning_rate": 9.759521484375e-07, "loss": 0.0016, "reward": 1.7148744463920593, "reward_std": 0.1324017532169819, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7226869761943817, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 349.1640625, "epoch": 0.0966796875, "grad_norm": 1.3533967735031216, "kl": 0.02520751953125, "learning_rate": 9.75830078125e-07, "loss": 0.001, "reward": 1.6926743984222412, "reward_std": 0.16136356070637703, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.716111958026886, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 320.78125, "epoch": 0.09716796875, "grad_norm": 1.7402462326884431, "kl": 0.02752685546875, "learning_rate": 9.757080078125e-07, "loss": 0.0011, "reward": 1.608244240283966, "reward_std": 0.1039048321545124, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6082442104816437, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 370.0859375, "epoch": 0.09765625, "grad_norm": 2.518073418544392, "kl": 0.034423828125, "learning_rate": 9.755859374999999e-07, "loss": 0.0014, "reward": 1.5175416469573975, "reward_std": 0.24317501485347748, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.6191041469573975, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 258.9453125, "epoch": 0.09814453125, "grad_norm": 6.096098416889449, "kl": 0.0357666015625, "learning_rate": 9.754638671875e-07, "loss": 0.0014, "reward": 1.6129422783851624, "reward_std": 0.11736492812633514, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6129422634840012, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 333.2734375, "epoch": 0.0986328125, "grad_norm": 3.5514643689891483, "kl": 0.038330078125, "learning_rate": 9.753417968749999e-07, "loss": 0.0015, "reward": 1.7121334075927734, "reward_std": 0.15303652733564377, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7590084075927734, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 360.0, "epoch": 0.09912109375, "grad_norm": 2.7929295597358172, "kl": 0.03955078125, "learning_rate": 9.752197265625e-07, "loss": 0.0016, "reward": 1.5464635491371155, "reward_std": 0.12700794637203217, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5542759895324707, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 338.234375, "epoch": 0.099609375, "grad_norm": 2.0494012473671406, "kl": 0.02825927734375, "learning_rate": 9.7509765625e-07, "loss": 0.0011, "reward": 1.6630714535713196, "reward_std": 0.10232871398329735, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6630714535713196, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 281.4140625, "epoch": 0.10009765625, "grad_norm": 7.6058284423417115, "kl": 0.03277587890625, "learning_rate": 9.749755859375e-07, "loss": 0.0013, "reward": 1.718002438545227, "reward_std": 0.10656377673149109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.718002438545227, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 328.984375, "epoch": 0.1005859375, "grad_norm": 4.022969206760325, "kl": 0.0401611328125, "learning_rate": 9.74853515625e-07, "loss": 0.0016, "reward": 1.7202000617980957, "reward_std": 0.15844309329986572, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7358251512050629, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 357.8125, "epoch": 0.10107421875, "grad_norm": 1.9668753587480723, "kl": 0.040283203125, "learning_rate": 9.747314453124999e-07, "loss": 0.0016, "reward": 1.4545677304267883, "reward_std": 0.25500622391700745, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.5483177602291107, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 274.6796875, "epoch": 0.1015625, "grad_norm": 4.526438674345018, "kl": 0.0428466796875, "learning_rate": 9.74609375e-07, "loss": 0.0017, "reward": 1.6247982382774353, "reward_std": 0.07701070234179497, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6247982978820801, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 419.1875, "epoch": 0.10205078125, "grad_norm": 2.0398164468164968, "kl": 0.0357666015625, "learning_rate": 9.744873046874999e-07, "loss": 0.0014, "reward": 1.587377667427063, "reward_std": 0.21580906957387924, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.642065167427063, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 328.7734375, "epoch": 0.1025390625, "grad_norm": 1.7111018400786906, "kl": 0.03631591796875, "learning_rate": 9.74365234375e-07, "loss": 0.0015, "reward": 1.5716455578804016, "reward_std": 0.14876239746809006, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6185204684734344, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 368.4609375, "epoch": 0.10302734375, "grad_norm": 1.43762144541971, "kl": 0.036865234375, "learning_rate": 9.742431640625e-07, "loss": 0.0015, "reward": 1.7287642359733582, "reward_std": 0.1475791335105896, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7443892359733582, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 346.3203125, "epoch": 0.103515625, "grad_norm": 1.2895212391486286, "kl": 0.03558349609375, "learning_rate": 9.7412109375e-07, "loss": 0.0014, "reward": 1.5150426030158997, "reward_std": 0.18442986905574799, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5462925732135773, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 312.0, "epoch": 0.10400390625, "grad_norm": 15.302170515429042, "kl": 0.04736328125, "learning_rate": 9.739990234375e-07, "loss": 0.0019, "reward": 1.492401361465454, "reward_std": 0.22428305447101593, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5158388316631317, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 331.21875, "epoch": 0.1044921875, "grad_norm": 3.158380258746703, "kl": 0.041748046875, "learning_rate": 9.738769531249999e-07, "loss": 0.0017, "reward": 1.7838861346244812, "reward_std": 0.1373641975224018, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7838861048221588, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 389.8515625, "epoch": 0.10498046875, "grad_norm": 1.9522656170518902, "kl": 0.0390625, "learning_rate": 9.737548828125e-07, "loss": 0.0016, "reward": 1.7652413845062256, "reward_std": 0.1776389330625534, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7964914739131927, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 438.9140625, "epoch": 0.10546875, "grad_norm": 1.1465475583145053, "kl": 0.0390625, "learning_rate": 9.736328125e-07, "loss": 0.0016, "reward": 1.6162505149841309, "reward_std": 0.17765599489212036, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6553130149841309, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 342.8984375, "epoch": 0.10595703125, "grad_norm": 2.8690359668413126, "kl": 0.0386962890625, "learning_rate": 9.735107421875e-07, "loss": 0.0015, "reward": 1.6361339688301086, "reward_std": 0.1363746039569378, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.690821498632431, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 342.28125, "epoch": 0.1064453125, "grad_norm": 3.731840752505406, "kl": 0.04071044921875, "learning_rate": 9.73388671875e-07, "loss": 0.0016, "reward": 1.7846105098724365, "reward_std": 0.11500228941440582, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7846105098724365, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 334.15625, "epoch": 0.10693359375, "grad_norm": 1.3127699847264673, "kl": 0.04736328125, "learning_rate": 9.732666015625e-07, "loss": 0.0019, "reward": 1.7865891456604004, "reward_std": 0.11259111389517784, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7944017052650452, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 292.640625, "epoch": 0.107421875, "grad_norm": 2.5515744642251033, "kl": 0.0418701171875, "learning_rate": 9.7314453125e-07, "loss": 0.0017, "reward": 1.5939872860908508, "reward_std": 0.11360449716448784, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6096123307943344, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 413.9609375, "epoch": 0.10791015625, "grad_norm": 5.193842375036834, "kl": 0.03350830078125, "learning_rate": 9.730224609374999e-07, "loss": 0.0013, "reward": 1.6299118399620056, "reward_std": 0.18084490299224854, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6533493101596832, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 359.28125, "epoch": 0.1083984375, "grad_norm": 3.180517710779925, "kl": 0.03668212890625, "learning_rate": 9.72900390625e-07, "loss": 0.0015, "reward": 1.7199169397354126, "reward_std": 0.15011364966630936, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7511670291423798, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 301.828125, "epoch": 0.10888671875, "grad_norm": 8.771676797515104, "kl": 0.03662109375, "learning_rate": 9.727783203125e-07, "loss": 0.0015, "reward": 1.6318160891532898, "reward_std": 0.060743046924471855, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6318160891532898, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 298.328125, "epoch": 0.109375, "grad_norm": 4.507274379774449, "kl": 0.045166015625, "learning_rate": 9.7265625e-07, "loss": 0.0018, "reward": 1.6135406494140625, "reward_std": 0.11964382976293564, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6369781494140625, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 291.65625, "epoch": 0.10986328125, "grad_norm": 0.7984406651823597, "kl": 0.040771484375, "learning_rate": 9.725341796875e-07, "loss": 0.0016, "reward": 1.5897437930107117, "reward_std": 0.09517102688550949, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6288062930107117, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 443.6015625, "epoch": 0.1103515625, "grad_norm": 3.691513509948555, "kl": 0.02752685546875, "learning_rate": 9.724121093749999e-07, "loss": 0.0011, "reward": 1.5694403648376465, "reward_std": 0.2135012000799179, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.6631903648376465, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 277.109375, "epoch": 0.11083984375, "grad_norm": 1.9308963288087744, "kl": 0.031982421875, "learning_rate": 9.722900390625e-07, "loss": 0.0013, "reward": 1.846408486366272, "reward_std": 0.11861564591526985, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8464084565639496, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 398.75, "epoch": 0.111328125, "grad_norm": 2.2747259722712303, "kl": 0.03173828125, "learning_rate": 9.721679687499999e-07, "loss": 0.0013, "reward": 1.3386054635047913, "reward_std": 0.20506983995437622, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.40110543370246887, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 381.1953125, "epoch": 0.11181640625, "grad_norm": 3.898177359401665, "kl": 0.02996826171875, "learning_rate": 9.720458984375e-07, "loss": 0.0012, "reward": 1.562267780303955, "reward_std": 0.21021173894405365, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6091427206993103, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 350.4375, "epoch": 0.1123046875, "grad_norm": 4.223589957552283, "kl": 0.026611328125, "learning_rate": 9.71923828125e-07, "loss": 0.0011, "reward": 1.6675159335136414, "reward_std": 0.16610606014728546, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7065784335136414, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 326.71875, "epoch": 0.11279296875, "grad_norm": 1.379228938803975, "kl": 0.0284423828125, "learning_rate": 9.718017578125e-07, "loss": 0.0011, "reward": 1.5755912065505981, "reward_std": 0.16047358512878418, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6068412065505981, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 296.90625, "epoch": 0.11328125, "grad_norm": 2.347155683071109, "kl": 0.03155517578125, "learning_rate": 9.716796875e-07, "loss": 0.0013, "reward": 1.6779637932777405, "reward_std": 0.13747821748256683, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6857762336730957, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 331.453125, "epoch": 0.11376953125, "grad_norm": 1.9265679656994268, "kl": 0.0267333984375, "learning_rate": 9.715576171874999e-07, "loss": 0.0011, "reward": 1.6846604943275452, "reward_std": 0.11376481875777245, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6846604943275452, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 362.4140625, "epoch": 0.1142578125, "grad_norm": 2.2869527266899348, "kl": 0.02642822265625, "learning_rate": 9.71435546875e-07, "loss": 0.0011, "reward": 1.5628395676612854, "reward_std": 0.1172020323574543, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6175270974636078, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 328.7578125, "epoch": 0.11474609375, "grad_norm": 2.244893512292209, "kl": 0.02557373046875, "learning_rate": 9.713134765624999e-07, "loss": 0.001, "reward": 1.6671748161315918, "reward_std": 0.12159543856978416, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.682799756526947, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 263.390625, "epoch": 0.115234375, "grad_norm": 2.953265867643204, "kl": 0.0301513671875, "learning_rate": 9.7119140625e-07, "loss": 0.0012, "reward": 1.4195521473884583, "reward_std": 0.08454703539609909, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.42736467719078064, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 298.6171875, "epoch": 0.11572265625, "grad_norm": 5.848251432569352, "kl": 0.0361328125, "learning_rate": 9.710693359375e-07, "loss": 0.0014, "reward": 1.5740194916725159, "reward_std": 0.21067717671394348, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5974570214748383, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 367.0546875, "epoch": 0.1162109375, "grad_norm": 3.4952892235581126, "kl": 0.02520751953125, "learning_rate": 9.70947265625e-07, "loss": 0.001, "reward": 1.545112133026123, "reward_std": 0.19367430359125137, "rewards/format_reward": 0.8984375, "rewards/ocr_reward": 0.646674633026123, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 332.25, "epoch": 0.11669921875, "grad_norm": 6.611274101328795, "kl": 0.0341796875, "learning_rate": 9.708251953125e-07, "loss": 0.0014, "reward": 1.7572271823883057, "reward_std": 0.13795867562294006, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7650396823883057, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 296.53125, "epoch": 0.1171875, "grad_norm": 3.1286325507021466, "kl": 0.0322265625, "learning_rate": 9.707031249999999e-07, "loss": 0.0013, "reward": 1.6395533084869385, "reward_std": 0.09670542925596237, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6395533084869385, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 214.703125, "epoch": 0.11767578125, "grad_norm": 1.627275271283892, "kl": 0.038330078125, "learning_rate": 9.705810546875e-07, "loss": 0.0015, "reward": 1.644788920879364, "reward_std": 0.04493547976016998, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6447888910770416, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 338.7890625, "epoch": 0.1181640625, "grad_norm": 6.14755791950804, "kl": 0.03076171875, "learning_rate": 9.70458984375e-07, "loss": 0.0012, "reward": 1.6965675354003906, "reward_std": 0.12545205652713776, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7043800354003906, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 262.296875, "epoch": 0.11865234375, "grad_norm": 4.504232771428641, "kl": 0.0355224609375, "learning_rate": 9.703369140625e-07, "loss": 0.0014, "reward": 1.6774699091911316, "reward_std": 0.10419408231973648, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6930948793888092, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 280.484375, "epoch": 0.119140625, "grad_norm": 1.7486968845761774, "kl": 0.03228759765625, "learning_rate": 9.7021484375e-07, "loss": 0.0013, "reward": 1.8203869462013245, "reward_std": 0.08420379087328911, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8203868865966797, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 344.53125, "epoch": 0.11962890625, "grad_norm": 2.2888106990202304, "kl": 0.031982421875, "learning_rate": 9.700927734375e-07, "loss": 0.0013, "reward": 1.5356090068817139, "reward_std": 0.18412478268146515, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5746715664863586, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 378.40625, "epoch": 0.1201171875, "grad_norm": 4.440936065671452, "kl": 0.02557373046875, "learning_rate": 9.69970703125e-07, "loss": 0.001, "reward": 1.6864354610443115, "reward_std": 0.16976945102214813, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7176855206489563, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 333.296875, "epoch": 0.12060546875, "grad_norm": 1.6696154402341608, "kl": 0.03326416015625, "learning_rate": 9.698486328124999e-07, "loss": 0.0013, "reward": 1.6419482827186584, "reward_std": 0.141361266374588, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6810107231140137, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 337.453125, "epoch": 0.12109375, "grad_norm": 5.609952930401551, "kl": 0.03277587890625, "learning_rate": 9.697265625e-07, "loss": 0.0013, "reward": 1.6594606637954712, "reward_std": 0.09116644039750099, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6594606637954712, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 348.3125, "epoch": 0.12158203125, "grad_norm": 1.6010400179727664, "kl": 0.028076171875, "learning_rate": 9.696044921875e-07, "loss": 0.0011, "reward": 1.548350989818573, "reward_std": 0.11171835660934448, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5639760047197342, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 389.5078125, "epoch": 0.1220703125, "grad_norm": 8.329224730907894, "kl": 0.0340576171875, "learning_rate": 9.69482421875e-07, "loss": 0.0014, "reward": 1.4914612770080566, "reward_std": 0.21047968417406082, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.569586306810379, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 353.4296875, "epoch": 0.12255859375, "grad_norm": 3.6020583542087117, "kl": 0.030029296875, "learning_rate": 9.693603515625e-07, "loss": 0.0012, "reward": 1.811837911605835, "reward_std": 0.045381875708699226, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8118377923965454, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 437.6171875, "epoch": 0.123046875, "grad_norm": 2.8891967781818044, "kl": 0.02545166015625, "learning_rate": 9.6923828125e-07, "loss": 0.001, "reward": 1.5089460015296936, "reward_std": 0.3081662133336067, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.602696031332016, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 265.8046875, "epoch": 0.12353515625, "grad_norm": 3.246272295841944, "kl": 0.04095458984375, "learning_rate": 9.691162109375e-07, "loss": 0.0016, "reward": 1.7035585045814514, "reward_std": 0.11963363364338875, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7035585343837738, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 284.7109375, "epoch": 0.1240234375, "grad_norm": 3.151160447979875, "kl": 0.0423583984375, "learning_rate": 9.689941406249999e-07, "loss": 0.0017, "reward": 1.6344158053398132, "reward_std": 0.18827372789382935, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6656658351421356, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 380.65625, "epoch": 0.12451171875, "grad_norm": 6.070686166424575, "kl": 0.03094482421875, "learning_rate": 9.688720703125e-07, "loss": 0.0012, "reward": 1.71599280834198, "reward_std": 0.15431293100118637, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7394302487373352, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 303.0234375, "epoch": 0.125, "grad_norm": 0.9304227502971951, "kl": 0.03057861328125, "learning_rate": 9.6875e-07, "loss": 0.0012, "reward": 1.7525382041931152, "reward_std": 0.09551074542105198, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7759757041931152, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 278.71875, "epoch": 0.12548828125, "grad_norm": 3.7527081977389494, "kl": 0.0892333984375, "learning_rate": 9.686279296875e-07, "loss": 0.0036, "reward": 1.8059654235839844, "reward_std": 0.11925885081291199, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8137778639793396, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 332.890625, "epoch": 0.1259765625, "grad_norm": 3.2980897081154468, "kl": 0.027099609375, "learning_rate": 9.68505859375e-07, "loss": 0.0011, "reward": 1.7137970328330994, "reward_std": 0.13171366602182388, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7294220626354218, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 284.953125, "epoch": 0.12646484375, "grad_norm": 4.3680814156942285, "kl": 0.055419921875, "learning_rate": 9.683837890624999e-07, "loss": 0.0022, "reward": 1.719020664691925, "reward_std": 0.10069620236754417, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7268331944942474, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 266.1484375, "epoch": 0.126953125, "grad_norm": 1.7575782040816468, "kl": 0.032470703125, "learning_rate": 9.6826171875e-07, "loss": 0.0013, "reward": 1.756038784980774, "reward_std": 0.1373431235551834, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7560386955738068, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 256.1640625, "epoch": 0.12744140625, "grad_norm": 2.2674231639337674, "kl": 0.0382080078125, "learning_rate": 9.681396484374999e-07, "loss": 0.0015, "reward": 1.6681320667266846, "reward_std": 0.08800495602190495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6681320667266846, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 358.0390625, "epoch": 0.1279296875, "grad_norm": 2.133067633460261, "kl": 0.032470703125, "learning_rate": 9.68017578125e-07, "loss": 0.0013, "reward": 1.7665959596633911, "reward_std": 0.11527542397379875, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7744084894657135, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 294.7734375, "epoch": 0.12841796875, "grad_norm": 2.9133910121332476, "kl": 0.0313720703125, "learning_rate": 9.678955078125e-07, "loss": 0.0013, "reward": 1.622836172580719, "reward_std": 0.08527448028326035, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6228361874818802, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 279.0, "epoch": 0.12890625, "grad_norm": 2.595596598906522, "kl": 0.0341796875, "learning_rate": 9.677734375e-07, "loss": 0.0014, "reward": 1.753430426120758, "reward_std": 0.06863740459084511, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7534304261207581, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 314.4609375, "epoch": 0.12939453125, "grad_norm": 1.9387469073690122, "kl": 0.0390625, "learning_rate": 9.676513671875e-07, "loss": 0.0016, "reward": 1.6287448406219482, "reward_std": 0.15640820562839508, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6443698704242706, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 250.453125, "epoch": 0.1298828125, "grad_norm": 13.216425880817694, "kl": 0.0343017578125, "learning_rate": 9.675292968749999e-07, "loss": 0.0014, "reward": 1.731309413909912, "reward_std": 0.08267020061612129, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.731309324502945, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 239.59375, "epoch": 0.13037109375, "grad_norm": 4.862426726552091, "kl": 0.0419921875, "learning_rate": 9.674072265625e-07, "loss": 0.0017, "reward": 1.648730993270874, "reward_std": 0.07836638763546944, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6487309336662292, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 351.9375, "epoch": 0.130859375, "grad_norm": 21.763882065889554, "kl": 0.0308837890625, "learning_rate": 9.6728515625e-07, "loss": 0.0012, "reward": 1.6850923895835876, "reward_std": 0.10728929005563259, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7007173895835876, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 288.265625, "epoch": 0.13134765625, "grad_norm": 2.082497501815107, "kl": 0.0380859375, "learning_rate": 9.671630859375e-07, "loss": 0.0015, "reward": 1.667827844619751, "reward_std": 0.09125854074954987, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6678277850151062, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 258.4140625, "epoch": 0.1318359375, "grad_norm": 3.1061731600297717, "kl": 0.0426025390625, "learning_rate": 9.67041015625e-07, "loss": 0.0017, "reward": 1.6657472848892212, "reward_std": 0.10530559718608856, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.665747344493866, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 406.1328125, "epoch": 0.13232421875, "grad_norm": 1.6926718726678105, "kl": 0.03216552734375, "learning_rate": 9.669189453125e-07, "loss": 0.0013, "reward": 1.6102675795555115, "reward_std": 0.20465338230133057, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6415176093578339, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 273.875, "epoch": 0.1328125, "grad_norm": 1.5814783438080073, "kl": 0.0360107421875, "learning_rate": 9.66796875e-07, "loss": 0.0014, "reward": 1.6680699586868286, "reward_std": 0.0880473144352436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6680700480937958, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 327.203125, "epoch": 0.13330078125, "grad_norm": 2.340261215855065, "kl": 0.0401611328125, "learning_rate": 9.666748046874999e-07, "loss": 0.0016, "reward": 1.7241803407669067, "reward_std": 0.1692553162574768, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7476178705692291, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 318.7578125, "epoch": 0.1337890625, "grad_norm": 4.11544829128727, "kl": 0.0391845703125, "learning_rate": 9.66552734375e-07, "loss": 0.0016, "reward": 1.781490683555603, "reward_std": 0.13933787494897842, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7971156537532806, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 314.2578125, "epoch": 0.13427734375, "grad_norm": 1.4526251271367776, "kl": 0.0401611328125, "learning_rate": 9.664306640625e-07, "loss": 0.0016, "reward": 1.6937137246131897, "reward_std": 0.1856069192290306, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7327762842178345, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 275.578125, "epoch": 0.134765625, "grad_norm": 10.727511575491055, "kl": 0.0391845703125, "learning_rate": 9.6630859375e-07, "loss": 0.0016, "reward": 1.5141828656196594, "reward_std": 0.12065092846751213, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5219953954219818, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 359.8671875, "epoch": 0.13525390625, "grad_norm": 1.9833864945403907, "kl": 0.03375244140625, "learning_rate": 9.661865234375e-07, "loss": 0.0013, "reward": 1.536266803741455, "reward_std": 0.21020027250051498, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.6143918633460999, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 372.4140625, "epoch": 0.1357421875, "grad_norm": 6.895919430163141, "kl": 0.029541015625, "learning_rate": 9.66064453125e-07, "loss": 0.0012, "reward": 1.6948537826538086, "reward_std": 0.11981324478983879, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.702666312456131, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 308.328125, "epoch": 0.13623046875, "grad_norm": 2.2534331703734067, "kl": 0.03424072265625, "learning_rate": 9.659423828125e-07, "loss": 0.0014, "reward": 1.6411468386650085, "reward_std": 0.08064734004437923, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6411468386650085, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 249.625, "epoch": 0.13671875, "grad_norm": 3.863896913907151, "kl": 0.04150390625, "learning_rate": 9.658203124999999e-07, "loss": 0.0017, "reward": 1.6285604238510132, "reward_std": 0.12783172726631165, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6754354536533356, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 362.609375, "epoch": 0.13720703125, "grad_norm": 2.933038355393098, "kl": 0.02764892578125, "learning_rate": 9.656982421875e-07, "loss": 0.0011, "reward": 1.7419158220291138, "reward_std": 0.14980874210596085, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7653533220291138, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 380.703125, "epoch": 0.1376953125, "grad_norm": 9.395073865019247, "kl": 0.03985595703125, "learning_rate": 9.65576171875e-07, "loss": 0.0016, "reward": 1.6920581459999084, "reward_std": 0.12204625830054283, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6920581459999084, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 319.3984375, "epoch": 0.13818359375, "grad_norm": 3.37707988325681, "kl": 0.035400390625, "learning_rate": 9.654541015625e-07, "loss": 0.0014, "reward": 1.497445821762085, "reward_std": 0.1840338483452797, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5365082919597626, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 300.3046875, "epoch": 0.138671875, "grad_norm": 2.0075105685871426, "kl": 0.032958984375, "learning_rate": 9.6533203125e-07, "loss": 0.0013, "reward": 1.6478480100631714, "reward_std": 0.11625828593969345, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6634730100631714, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 322.578125, "epoch": 0.13916015625, "grad_norm": 6.271628293640765, "kl": 0.03070068359375, "learning_rate": 9.652099609374999e-07, "loss": 0.0012, "reward": 1.5877465605735779, "reward_std": 0.18424838036298752, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6268090903759003, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 302.734375, "epoch": 0.1396484375, "grad_norm": 2.765333625422615, "kl": 0.039306640625, "learning_rate": 9.65087890625e-07, "loss": 0.0016, "reward": 1.6684794425964355, "reward_std": 0.21452812105417252, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.699729323387146, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 322.046875, "epoch": 0.14013671875, "grad_norm": 2.836462248192525, "kl": 0.0313720703125, "learning_rate": 9.649658203124999e-07, "loss": 0.0013, "reward": 1.7276391983032227, "reward_std": 0.19272325932979584, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7667016685009003, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 323.953125, "epoch": 0.140625, "grad_norm": 1.5790291022053742, "kl": 0.02911376953125, "learning_rate": 9.6484375e-07, "loss": 0.0012, "reward": 1.6144769787788391, "reward_std": 0.1834145449101925, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6535394489765167, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 356.28125, "epoch": 0.14111328125, "grad_norm": 1.1042158738010264, "kl": 0.0274658203125, "learning_rate": 9.647216796875e-07, "loss": 0.0011, "reward": 1.6875471472740173, "reward_std": 0.1275060921907425, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7031721770763397, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 411.125, "epoch": 0.1416015625, "grad_norm": 2.283777941321073, "kl": 0.0245361328125, "learning_rate": 9.64599609375e-07, "loss": 0.001, "reward": 1.6654972434043884, "reward_std": 0.1885884590446949, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7045597434043884, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 386.46875, "epoch": 0.14208984375, "grad_norm": 1.5795066656688896, "kl": 0.0257568359375, "learning_rate": 9.644775390625e-07, "loss": 0.001, "reward": 1.7022438049316406, "reward_std": 0.1274988241493702, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7256813049316406, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 385.4921875, "epoch": 0.142578125, "grad_norm": 2.172103990155339, "kl": 0.024658203125, "learning_rate": 9.643554687499999e-07, "loss": 0.001, "reward": 1.608510136604309, "reward_std": 0.11927095800638199, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6475726366043091, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 256.3125, "epoch": 0.14306640625, "grad_norm": 3.281405859635334, "kl": 0.038330078125, "learning_rate": 9.642333984375e-07, "loss": 0.0015, "reward": 1.5347102880477905, "reward_std": 0.10195699892938137, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5659602731466293, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 315.8046875, "epoch": 0.1435546875, "grad_norm": 1.4610945064230194, "kl": 0.02362060546875, "learning_rate": 9.64111328125e-07, "loss": 0.0009, "reward": 1.6751810312271118, "reward_std": 0.1327841766178608, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6986185312271118, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 302.4375, "epoch": 0.14404296875, "grad_norm": 2.083770286674266, "kl": 0.03302001953125, "learning_rate": 9.639892578125e-07, "loss": 0.0013, "reward": 1.7501333951950073, "reward_std": 0.09881668537855148, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7579458951950073, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 312.078125, "epoch": 0.14453125, "grad_norm": 3.9481991676001216, "kl": 0.0372314453125, "learning_rate": 9.638671875e-07, "loss": 0.0015, "reward": 1.6677301526069641, "reward_std": 0.07496082410216331, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6677302122116089, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 286.6484375, "epoch": 0.14501953125, "grad_norm": 7.04024659813487, "kl": 0.0308837890625, "learning_rate": 9.637451171875e-07, "loss": 0.0012, "reward": 1.7570677399635315, "reward_std": 0.09553324803709984, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7570676803588867, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 294.3203125, "epoch": 0.1455078125, "grad_norm": 4.161631659200875, "kl": 0.0411376953125, "learning_rate": 9.63623046875e-07, "loss": 0.0016, "reward": 1.5669713020324707, "reward_std": 0.17707626521587372, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5904087424278259, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 282.9375, "epoch": 0.14599609375, "grad_norm": 2.9329021992243134, "kl": 0.0474853515625, "learning_rate": 9.635009765624999e-07, "loss": 0.0019, "reward": 1.6684596538543701, "reward_std": 0.1166144497692585, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6997096538543701, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 307.0390625, "epoch": 0.146484375, "grad_norm": 4.806487359363194, "kl": 0.033935546875, "learning_rate": 9.6337890625e-07, "loss": 0.0014, "reward": 1.83830726146698, "reward_std": 0.04638480953872204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8383072018623352, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 361.6875, "epoch": 0.14697265625, "grad_norm": 1.8400115433509951, "kl": 0.03753662109375, "learning_rate": 9.632568359375e-07, "loss": 0.0015, "reward": 1.6406999826431274, "reward_std": 0.05689780414104462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.640699952840805, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 426.078125, "epoch": 0.1474609375, "grad_norm": 1.859129267311832, "kl": 0.03448486328125, "learning_rate": 9.63134765625e-07, "loss": 0.0014, "reward": 1.6312952637672424, "reward_std": 0.08252920210361481, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6312953531742096, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 349.59375, "epoch": 0.14794921875, "grad_norm": 1.5493611450032359, "kl": 0.02734375, "learning_rate": 9.630126953125e-07, "loss": 0.0011, "reward": 1.7147611379623413, "reward_std": 0.08944166824221611, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7225736379623413, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 346.2421875, "epoch": 0.1484375, "grad_norm": 1.8976423626253172, "kl": 0.03570556640625, "learning_rate": 9.62890625e-07, "loss": 0.0014, "reward": 1.6145520210266113, "reward_std": 0.18844667822122574, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6614269018173218, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 200.640625, "epoch": 0.14892578125, "grad_norm": 7.425501960614286, "kl": 0.047607421875, "learning_rate": 9.627685546875e-07, "loss": 0.0019, "reward": 1.7208858728408813, "reward_std": 0.1330663561820984, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7208858132362366, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 333.6484375, "epoch": 0.1494140625, "grad_norm": 2.769163068983383, "kl": 0.03759765625, "learning_rate": 9.626464843749999e-07, "loss": 0.0015, "reward": 1.5909721851348877, "reward_std": 0.21000181138515472, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6144096851348877, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 226.984375, "epoch": 0.14990234375, "grad_norm": 5.7395657037691326, "kl": 0.03375244140625, "learning_rate": 9.625244140625e-07, "loss": 0.0013, "reward": 1.6622443199157715, "reward_std": 0.03923766687512398, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6622443348169327, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 462.1328125, "epoch": 0.150390625, "grad_norm": 2.6717724620758663, "kl": 0.02557373046875, "learning_rate": 9.6240234375e-07, "loss": 0.001, "reward": 1.5063217282295227, "reward_std": 0.20277608931064606, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5531966686248779, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 315.0234375, "epoch": 0.15087890625, "grad_norm": 2.18014699586722, "kl": 0.041748046875, "learning_rate": 9.622802734375e-07, "loss": 0.0017, "reward": 1.6508269906044006, "reward_std": 0.13892033696174622, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.697702020406723, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 342.9765625, "epoch": 0.1513671875, "grad_norm": 1.717478656404003, "kl": 0.0283203125, "learning_rate": 9.62158203125e-07, "loss": 0.0011, "reward": 1.6870404481887817, "reward_std": 0.06977767683565617, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6870404779911041, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 328.4765625, "epoch": 0.15185546875, "grad_norm": 1.7043876842099035, "kl": 0.0340576171875, "learning_rate": 9.620361328124999e-07, "loss": 0.0014, "reward": 1.5702768564224243, "reward_std": 0.15926361829042435, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5859018266201019, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 336.2109375, "epoch": 0.15234375, "grad_norm": 2.125053862773254, "kl": 0.028076171875, "learning_rate": 9.619140625e-07, "loss": 0.0011, "reward": 1.6080606579780579, "reward_std": 0.14491120725870132, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6471231281757355, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 323.90625, "epoch": 0.15283203125, "grad_norm": 3.1096892974164425, "kl": 0.036865234375, "learning_rate": 9.617919921874999e-07, "loss": 0.0015, "reward": 1.5795653462409973, "reward_std": 0.11042843386530876, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5795653164386749, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 250.59375, "epoch": 0.1533203125, "grad_norm": 3.5995680052886527, "kl": 0.0377197265625, "learning_rate": 9.61669921875e-07, "loss": 0.0015, "reward": 1.646964430809021, "reward_std": 0.12394942343235016, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.646964430809021, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 271.0390625, "epoch": 0.15380859375, "grad_norm": 1.956255504508322, "kl": 0.0301513671875, "learning_rate": 9.615478515625e-07, "loss": 0.0012, "reward": 1.7696388363838196, "reward_std": 0.05953131802380085, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7696388363838196, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 337.515625, "epoch": 0.154296875, "grad_norm": 5.191246910967392, "kl": 0.03155517578125, "learning_rate": 9.6142578125e-07, "loss": 0.0013, "reward": 1.5658961534500122, "reward_std": 0.12328368425369263, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6205836087465286, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 207.1796875, "epoch": 0.15478515625, "grad_norm": 4.500450512155949, "kl": 0.03509521484375, "learning_rate": 9.613037109375e-07, "loss": 0.0014, "reward": 1.6759998798370361, "reward_std": 0.100888442248106, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6759998500347137, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 403.234375, "epoch": 0.1552734375, "grad_norm": 3.2796857668648842, "kl": 0.0269775390625, "learning_rate": 9.611816406249999e-07, "loss": 0.0011, "reward": 1.6215779781341553, "reward_std": 0.1620483510196209, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6684529185295105, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 301.2578125, "epoch": 0.15576171875, "grad_norm": 2.7662804735100517, "kl": 0.03497314453125, "learning_rate": 9.610595703125e-07, "loss": 0.0014, "reward": 1.6629568934440613, "reward_std": 0.14340640604496002, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.670769453048706, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 326.8671875, "epoch": 0.15625, "grad_norm": 4.542442828253781, "kl": 0.0322265625, "learning_rate": 9.609374999999999e-07, "loss": 0.0013, "reward": 1.711995244026184, "reward_std": 0.19287973642349243, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7510578036308289, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 300.2265625, "epoch": 0.15673828125, "grad_norm": 4.052768372311868, "kl": 0.0286865234375, "learning_rate": 9.608154296875e-07, "loss": 0.0011, "reward": 1.6291555762290955, "reward_std": 0.11671308055520058, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6525930762290955, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 335.5390625, "epoch": 0.1572265625, "grad_norm": 2.162537030435017, "kl": 0.0362548828125, "learning_rate": 9.60693359375e-07, "loss": 0.0014, "reward": 1.6343209147453308, "reward_std": 0.16108915954828262, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6499459147453308, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 321.09375, "epoch": 0.15771484375, "grad_norm": 1.856325185116223, "kl": 0.0341796875, "learning_rate": 9.605712890625e-07, "loss": 0.0014, "reward": 1.7311798930168152, "reward_std": 0.06938901171088219, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7311798632144928, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 320.4296875, "epoch": 0.158203125, "grad_norm": 5.005867500611158, "kl": 0.0361328125, "learning_rate": 9.6044921875e-07, "loss": 0.0014, "reward": 1.526106595993042, "reward_std": 0.20226696878671646, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.5886066257953644, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 290.2109375, "epoch": 0.15869140625, "grad_norm": 1.6834488335758562, "kl": 0.0377197265625, "learning_rate": 9.603271484374999e-07, "loss": 0.0015, "reward": 1.7446966171264648, "reward_std": 0.09506701678037643, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7446966171264648, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 380.0234375, "epoch": 0.1591796875, "grad_norm": 34.72900439154182, "kl": 0.02752685546875, "learning_rate": 9.60205078125e-07, "loss": 0.0011, "reward": 1.6595964431762695, "reward_std": 0.16007909923791885, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6752214133739471, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 292.4609375, "epoch": 0.15966796875, "grad_norm": 4.142586916976736, "kl": 0.0384521484375, "learning_rate": 9.600830078125e-07, "loss": 0.0015, "reward": 1.7855232954025269, "reward_std": 0.13429051637649536, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7933357656002045, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 212.6796875, "epoch": 0.16015625, "grad_norm": 2.0417286955446574, "kl": 0.0391845703125, "learning_rate": 9.599609375e-07, "loss": 0.0016, "reward": 1.7793214321136475, "reward_std": 0.05697181820869446, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7793213725090027, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 438.84375, "epoch": 0.16064453125, "grad_norm": 1.4353302978671976, "kl": 0.0302734375, "learning_rate": 9.598388671875e-07, "loss": 0.0012, "reward": 1.6387850642204285, "reward_std": 0.37150806188583374, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.7325350046157837, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 255.1328125, "epoch": 0.1611328125, "grad_norm": 3.1073764344455337, "kl": 0.0394287109375, "learning_rate": 9.59716796875e-07, "loss": 0.0016, "reward": 1.5094847083091736, "reward_std": 0.13999176025390625, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.5719846189022064, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 392.53125, "epoch": 0.16162109375, "grad_norm": 1.3469828802315056, "kl": 0.02386474609375, "learning_rate": 9.595947265625e-07, "loss": 0.001, "reward": 1.7278481125831604, "reward_std": 0.1803218349814415, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7747230529785156, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 342.1328125, "epoch": 0.162109375, "grad_norm": 2.4061717119122776, "kl": 0.03167724609375, "learning_rate": 9.594726562499999e-07, "loss": 0.0013, "reward": 1.7532138228416443, "reward_std": 0.13944057375192642, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7610263526439667, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 291.578125, "epoch": 0.16259765625, "grad_norm": 1.5063714645040478, "kl": 0.03363037109375, "learning_rate": 9.593505859375e-07, "loss": 0.0013, "reward": 1.6756377220153809, "reward_std": 0.06455008871853352, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6756377518177032, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 372.3125, "epoch": 0.1630859375, "grad_norm": 1.3347945227728137, "kl": 0.02984619140625, "learning_rate": 9.59228515625e-07, "loss": 0.0012, "reward": 1.7514132857322693, "reward_std": 0.09506377205252647, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7592257857322693, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 281.109375, "epoch": 0.16357421875, "grad_norm": 3.723407278968701, "kl": 0.0455322265625, "learning_rate": 9.591064453125e-07, "loss": 0.0018, "reward": 1.6376798152923584, "reward_std": 0.15414723008871078, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6767423748970032, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 342.53125, "epoch": 0.1640625, "grad_norm": 3.9448517340622655, "kl": 0.031494140625, "learning_rate": 9.58984375e-07, "loss": 0.0013, "reward": 1.668643057346344, "reward_std": 0.07998159155249596, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6686430275440216, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 360.0390625, "epoch": 0.16455078125, "grad_norm": 11.966779228153586, "kl": 0.0382080078125, "learning_rate": 9.588623046875e-07, "loss": 0.0015, "reward": 1.6435166597366333, "reward_std": 0.13468455523252487, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6903917491436005, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 231.390625, "epoch": 0.1650390625, "grad_norm": 1.3026863061956178, "kl": 0.042724609375, "learning_rate": 9.58740234375e-07, "loss": 0.0017, "reward": 1.6170286536216736, "reward_std": 0.03771189600229263, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6170286238193512, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 410.6484375, "epoch": 0.16552734375, "grad_norm": 1.7776160609392315, "kl": 0.029541015625, "learning_rate": 9.586181640624999e-07, "loss": 0.0012, "reward": 1.6082661151885986, "reward_std": 0.16181888803839684, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6238911151885986, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 320.3515625, "epoch": 0.166015625, "grad_norm": 3.4344192664071636, "kl": 0.0374755859375, "learning_rate": 9.5849609375e-07, "loss": 0.0015, "reward": 1.6772453784942627, "reward_std": 0.09790786355733871, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7241203486919403, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 252.125, "epoch": 0.16650390625, "grad_norm": 3.4863206189382785, "kl": 0.038330078125, "learning_rate": 9.583740234375e-07, "loss": 0.0015, "reward": 1.7005472779273987, "reward_std": 0.09716508537530899, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7083597481250763, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 337.265625, "epoch": 0.1669921875, "grad_norm": 16.588569687572924, "kl": 0.03466796875, "learning_rate": 9.58251953125e-07, "loss": 0.0014, "reward": 1.6174096465110779, "reward_std": 0.11772006377577782, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6330346167087555, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 344.96875, "epoch": 0.16748046875, "grad_norm": 3.315958682715951, "kl": 0.03314208984375, "learning_rate": 9.581298828125e-07, "loss": 0.0013, "reward": 1.5178037285804749, "reward_std": 0.1745915710926056, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5490537583827972, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 312.5, "epoch": 0.16796875, "grad_norm": 2.2349202653555365, "kl": 0.0390625, "learning_rate": 9.580078124999999e-07, "loss": 0.0016, "reward": 1.6745514273643494, "reward_std": 0.1728959158062935, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.721426397562027, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 257.84375, "epoch": 0.16845703125, "grad_norm": 2.8256080294771637, "kl": 0.04638671875, "learning_rate": 9.578857421875e-07, "loss": 0.0019, "reward": 1.7369277477264404, "reward_std": 0.05663881450891495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7369276583194733, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 295.5703125, "epoch": 0.1689453125, "grad_norm": 2.627531421994624, "kl": 0.0396728515625, "learning_rate": 9.577636718749999e-07, "loss": 0.0016, "reward": 1.5510008335113525, "reward_std": 0.17610786110162735, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5822509080171585, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 379.6015625, "epoch": 0.16943359375, "grad_norm": 3.1216112029414482, "kl": 0.03594970703125, "learning_rate": 9.576416015625e-07, "loss": 0.0014, "reward": 1.7094944715499878, "reward_std": 0.08010836690664291, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7094944417476654, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 368.359375, "epoch": 0.169921875, "grad_norm": 2.0161100389850617, "kl": 0.0460205078125, "learning_rate": 9.5751953125e-07, "loss": 0.0018, "reward": 1.769561767578125, "reward_std": 0.03940633311867714, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.769561767578125, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 381.3203125, "epoch": 0.17041015625, "grad_norm": 1.678953605120237, "kl": 0.02923583984375, "learning_rate": 9.573974609375e-07, "loss": 0.0012, "reward": 1.7712068557739258, "reward_std": 0.1021023616194725, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7946443557739258, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 376.421875, "epoch": 0.1708984375, "grad_norm": 3.911611150477222, "kl": 0.0369873046875, "learning_rate": 9.57275390625e-07, "loss": 0.0015, "reward": 1.5145609378814697, "reward_std": 0.2088497430086136, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5614359080791473, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 296.7109375, "epoch": 0.17138671875, "grad_norm": 6.385932586698964, "kl": 0.035400390625, "learning_rate": 9.571533203124999e-07, "loss": 0.0014, "reward": 1.5890177488327026, "reward_std": 0.0945354737341404, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5890178084373474, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 344.5625, "epoch": 0.171875, "grad_norm": 1.6280503492839655, "kl": 0.037353515625, "learning_rate": 9.5703125e-07, "loss": 0.0015, "reward": 1.6745615005493164, "reward_std": 0.10443703085184097, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6823740601539612, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 414.9921875, "epoch": 0.17236328125, "grad_norm": 1.5117691402769504, "kl": 0.03424072265625, "learning_rate": 9.569091796875e-07, "loss": 0.0014, "reward": 1.6382949948310852, "reward_std": 0.20788590610027313, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.66173255443573, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 312.6171875, "epoch": 0.1728515625, "grad_norm": 3.7063712081293416, "kl": 0.0482177734375, "learning_rate": 9.56787109375e-07, "loss": 0.0019, "reward": 1.6848008632659912, "reward_std": 0.13139459863305092, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.684800922870636, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 319.5625, "epoch": 0.17333984375, "grad_norm": 1.5391314099083317, "kl": 0.040283203125, "learning_rate": 9.566650390625e-07, "loss": 0.0016, "reward": 1.6223503947257996, "reward_std": 0.1576566994190216, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6301629543304443, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 399.78125, "epoch": 0.173828125, "grad_norm": 2.7058681092420795, "kl": 0.0372314453125, "learning_rate": 9.5654296875e-07, "loss": 0.0015, "reward": 1.525748610496521, "reward_std": 0.19213548302650452, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5491860806941986, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 299.0390625, "epoch": 0.17431640625, "grad_norm": 2.0154711074208773, "kl": 0.0439453125, "learning_rate": 9.564208984375e-07, "loss": 0.0018, "reward": 1.7262452840805054, "reward_std": 0.11205626837909222, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726245254278183, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 346.6328125, "epoch": 0.1748046875, "grad_norm": 3.2566800818643813, "kl": 0.036865234375, "learning_rate": 9.562988281249999e-07, "loss": 0.0015, "reward": 1.5986173748970032, "reward_std": 0.17809632420539856, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6142423450946808, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 273.9765625, "epoch": 0.17529296875, "grad_norm": 4.6715149905690545, "kl": 0.0439453125, "learning_rate": 9.561767578125e-07, "loss": 0.0018, "reward": 1.685244619846344, "reward_std": 0.07497452571988106, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.685244619846344, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 269.46875, "epoch": 0.17578125, "grad_norm": 2.105222847919174, "kl": 0.0450439453125, "learning_rate": 9.560546875e-07, "loss": 0.0018, "reward": 1.6982702612876892, "reward_std": 0.17531277611851692, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7138952612876892, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 143.984375, "epoch": 0.17626953125, "grad_norm": 5.232570191497886, "kl": 0.0435791015625, "learning_rate": 9.559326171875e-07, "loss": 0.0017, "reward": 1.7132031321525574, "reward_std": 0.1074238047003746, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7132031321525574, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 300.71875, "epoch": 0.1767578125, "grad_norm": 2.1497345593947985, "kl": 0.0511474609375, "learning_rate": 9.55810546875e-07, "loss": 0.002, "reward": 1.4040643572807312, "reward_std": 0.08128705434501171, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.4040642976760864, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 308.515625, "epoch": 0.17724609375, "grad_norm": 8.38599033866768, "kl": 0.04931640625, "learning_rate": 9.556884765625e-07, "loss": 0.002, "reward": 1.6172441244125366, "reward_std": 0.10681581497192383, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6172442138195038, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 268.625, "epoch": 0.177734375, "grad_norm": 1.767290932124583, "kl": 0.0501708984375, "learning_rate": 9.5556640625e-07, "loss": 0.002, "reward": 1.6871461868286133, "reward_std": 0.060712188482284546, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6871461272239685, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 267.125, "epoch": 0.17822265625, "grad_norm": 2.9068478143017344, "kl": 0.0465087890625, "learning_rate": 9.554443359374999e-07, "loss": 0.0019, "reward": 1.721463680267334, "reward_std": 0.0778956264257431, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7214637100696564, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 324.359375, "epoch": 0.1787109375, "grad_norm": 5.5157852848407245, "kl": 0.0372314453125, "learning_rate": 9.55322265625e-07, "loss": 0.0015, "reward": 1.7536060810089111, "reward_std": 0.10080629587173462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7536060810089111, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 296.875, "epoch": 0.17919921875, "grad_norm": 2.984431123069507, "kl": 0.0540771484375, "learning_rate": 9.552001953125e-07, "loss": 0.0022, "reward": 1.6196495294570923, "reward_std": 0.10086812451481819, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6196494698524475, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 190.8046875, "epoch": 0.1796875, "grad_norm": 3.157195401410498, "kl": 0.063720703125, "learning_rate": 9.55078125e-07, "loss": 0.0025, "reward": 1.715992033481598, "reward_std": 0.1297970972955227, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7238045334815979, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 241.09375, "epoch": 0.18017578125, "grad_norm": 2.324085860520846, "kl": 0.070556640625, "learning_rate": 9.549560546875e-07, "loss": 0.0028, "reward": 1.700922667980194, "reward_std": 0.08325351774692535, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7009226679801941, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 332.234375, "epoch": 0.1806640625, "grad_norm": 2.004151009626354, "kl": 0.0498046875, "learning_rate": 9.548339843749999e-07, "loss": 0.002, "reward": 1.6857663989067078, "reward_std": 0.1576274000108242, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6935788691043854, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 307.8359375, "epoch": 0.18115234375, "grad_norm": 2.404587038530015, "kl": 0.046875, "learning_rate": 9.547119140625e-07, "loss": 0.0019, "reward": 1.6597256660461426, "reward_std": 0.13613457418978214, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6831631660461426, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 314.0703125, "epoch": 0.181640625, "grad_norm": 2.834868853183062, "kl": 0.0577392578125, "learning_rate": 9.545898437499999e-07, "loss": 0.0023, "reward": 1.5693495869636536, "reward_std": 0.14352120459079742, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5771620869636536, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 246.40625, "epoch": 0.18212890625, "grad_norm": 9.040737722489206, "kl": 0.044921875, "learning_rate": 9.544677734375e-07, "loss": 0.0018, "reward": 1.801272690296173, "reward_std": 0.05337041616439819, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012726902961731, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 313.078125, "epoch": 0.1826171875, "grad_norm": 3.5600976807232554, "kl": 0.0465087890625, "learning_rate": 9.54345703125e-07, "loss": 0.0019, "reward": 1.453054428100586, "reward_std": 0.1263410821557045, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.4608669579029083, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 291.171875, "epoch": 0.18310546875, "grad_norm": 2.002398563178058, "kl": 0.0484619140625, "learning_rate": 9.542236328125e-07, "loss": 0.0019, "reward": 1.6627238988876343, "reward_std": 0.07443033531308174, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6705364286899567, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 286.7890625, "epoch": 0.18359375, "grad_norm": 2.757249827207943, "kl": 0.050537109375, "learning_rate": 9.541015625e-07, "loss": 0.002, "reward": 1.6889954805374146, "reward_std": 0.08430779352784157, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6889954209327698, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 354.2265625, "epoch": 0.18408203125, "grad_norm": 4.5073105256951775, "kl": 0.0504150390625, "learning_rate": 9.539794921874999e-07, "loss": 0.002, "reward": 1.5870369672775269, "reward_std": 0.10734122432768345, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6182869672775269, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 416.421875, "epoch": 0.1845703125, "grad_norm": 7.196357548568271, "kl": 0.037109375, "learning_rate": 9.53857421875e-07, "loss": 0.0015, "reward": 1.6069696545600891, "reward_std": 0.2218686118721962, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6772821247577667, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 280.28125, "epoch": 0.18505859375, "grad_norm": 4.45308500198343, "kl": 0.058837890625, "learning_rate": 9.537353515625e-07, "loss": 0.0024, "reward": 1.7543954253196716, "reward_std": 0.06126508302986622, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7543954253196716, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 357.6171875, "epoch": 0.185546875, "grad_norm": 1.5466264285633915, "kl": 0.046142578125, "learning_rate": 9.536132812499999e-07, "loss": 0.0018, "reward": 1.6366318464279175, "reward_std": 0.1529180034995079, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6600694358348846, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 342.171875, "epoch": 0.18603515625, "grad_norm": 1.5074271098619745, "kl": 0.0447998046875, "learning_rate": 9.534912109374999e-07, "loss": 0.0018, "reward": 1.6499249935150146, "reward_std": 0.15157188847661018, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6889875531196594, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 374.7890625, "epoch": 0.1865234375, "grad_norm": 2.7697954920464434, "kl": 0.051513671875, "learning_rate": 9.533691406249999e-07, "loss": 0.0021, "reward": 1.5470696091651917, "reward_std": 0.2721578925848007, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.593944638967514, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 274.8046875, "epoch": 0.18701171875, "grad_norm": 3.8213692120277054, "kl": 0.06005859375, "learning_rate": 9.532470703125e-07, "loss": 0.0024, "reward": 1.6255079507827759, "reward_std": 0.21495968848466873, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6567580103874207, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 411.8671875, "epoch": 0.1875, "grad_norm": 4.4450768164862335, "kl": 0.046630859375, "learning_rate": 9.53125e-07, "loss": 0.0019, "reward": 1.6321772336959839, "reward_std": 0.2580869309604168, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.7103022634983063, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 337.0703125, "epoch": 0.18798828125, "grad_norm": 2.000719653156199, "kl": 0.0751953125, "learning_rate": 9.530029296875e-07, "loss": 0.003, "reward": 1.6403818130493164, "reward_std": 0.18112845346331596, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6794443130493164, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 315.9375, "epoch": 0.1884765625, "grad_norm": 1.2925552797273454, "kl": 0.0565185546875, "learning_rate": 9.52880859375e-07, "loss": 0.0023, "reward": 1.7097843885421753, "reward_std": 0.08790682628750801, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7332218289375305, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 311.125, "epoch": 0.18896484375, "grad_norm": 4.476596386367477, "kl": 0.0555419921875, "learning_rate": 9.527587890624999e-07, "loss": 0.0022, "reward": 1.6251919269561768, "reward_std": 0.1653646007180214, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6408169269561768, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 289.25, "epoch": 0.189453125, "grad_norm": 1.981834575248848, "kl": 0.0638427734375, "learning_rate": 9.526367187499999e-07, "loss": 0.0026, "reward": 1.7350217700004578, "reward_std": 0.08974255621433258, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7428342998027802, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 367.265625, "epoch": 0.18994140625, "grad_norm": 3.714264090994581, "kl": 0.0496826171875, "learning_rate": 9.525146484375e-07, "loss": 0.002, "reward": 1.5587335228919983, "reward_std": 0.1307937055826187, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5743584930896759, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 278.8828125, "epoch": 0.1904296875, "grad_norm": 0.7910658209647898, "kl": 0.0504150390625, "learning_rate": 9.52392578125e-07, "loss": 0.002, "reward": 1.738577127456665, "reward_std": 0.05491543561220169, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7463896572589874, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 221.625, "epoch": 0.19091796875, "grad_norm": 1.412501157568577, "kl": 0.0592041015625, "learning_rate": 9.522705078125e-07, "loss": 0.0024, "reward": 1.6673744916915894, "reward_std": 0.04720168560743332, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6673744320869446, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 312.765625, "epoch": 0.19140625, "grad_norm": 2.091929522012718, "kl": 0.04931640625, "learning_rate": 9.521484375e-07, "loss": 0.002, "reward": 1.6497448682785034, "reward_std": 0.17027802020311356, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6731823682785034, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 340.2265625, "epoch": 0.19189453125, "grad_norm": 1.1250339673162928, "kl": 0.0487060546875, "learning_rate": 9.520263671874999e-07, "loss": 0.0019, "reward": 1.6197129487991333, "reward_std": 0.15943622216582298, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6587753891944885, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 391.3515625, "epoch": 0.1923828125, "grad_norm": 2.6862482142648543, "kl": 0.041015625, "learning_rate": 9.519042968749999e-07, "loss": 0.0016, "reward": 1.7228458523750305, "reward_std": 0.06821495667099953, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7306584417819977, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 313.046875, "epoch": 0.19287109375, "grad_norm": 2.295224742756099, "kl": 0.0504150390625, "learning_rate": 9.517822265624999e-07, "loss": 0.002, "reward": 1.712727427482605, "reward_std": 0.13450950384140015, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.720539927482605, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 355.515625, "epoch": 0.193359375, "grad_norm": 1.949548832486549, "kl": 0.0498046875, "learning_rate": 9.5166015625e-07, "loss": 0.002, "reward": 1.5369553565979004, "reward_std": 0.21717742085456848, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.607267826795578, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 244.125, "epoch": 0.19384765625, "grad_norm": 3.751089819487828, "kl": 0.064697265625, "learning_rate": 9.515380859375e-07, "loss": 0.0026, "reward": 1.7010602951049805, "reward_std": 0.1037181131541729, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7010602951049805, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 427.109375, "epoch": 0.1943359375, "grad_norm": 2.0876288807152616, "kl": 0.041748046875, "learning_rate": 9.51416015625e-07, "loss": 0.0017, "reward": 1.6480942964553833, "reward_std": 0.1627689152956009, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6871567964553833, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 233.9453125, "epoch": 0.19482421875, "grad_norm": 3.7142502808168616, "kl": 0.0523681640625, "learning_rate": 9.512939453125e-07, "loss": 0.0021, "reward": 1.6217145919799805, "reward_std": 0.06836835853755474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6217146515846252, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 229.140625, "epoch": 0.1953125, "grad_norm": 3.234762690222564, "kl": 0.056640625, "learning_rate": 9.511718749999999e-07, "loss": 0.0023, "reward": 1.7640219926834106, "reward_std": 0.12531143426895142, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7718344628810883, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 269.7109375, "epoch": 0.19580078125, "grad_norm": 2.2580241394701956, "kl": 0.05419921875, "learning_rate": 9.510498046874999e-07, "loss": 0.0022, "reward": 1.6812456250190735, "reward_std": 0.11042129248380661, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6890580952167511, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 337.1015625, "epoch": 0.1962890625, "grad_norm": 2.113374918135095, "kl": 0.0408935546875, "learning_rate": 9.50927734375e-07, "loss": 0.0016, "reward": 1.7110391855239868, "reward_std": 0.08690160885453224, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7110391855239868, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 343.28125, "epoch": 0.19677734375, "grad_norm": 2.854663005537301, "kl": 0.0523681640625, "learning_rate": 9.508056640625e-07, "loss": 0.0021, "reward": 1.698850393295288, "reward_std": 0.16052530705928802, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7144753634929657, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 259.90625, "epoch": 0.197265625, "grad_norm": 3.566015688964678, "kl": 0.05322265625, "learning_rate": 9.5068359375e-07, "loss": 0.0021, "reward": 1.611766278743744, "reward_std": 0.1618807651102543, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6352038085460663, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 304.7109375, "epoch": 0.19775390625, "grad_norm": 27.087108641742997, "kl": 0.0535888671875, "learning_rate": 9.505615234375e-07, "loss": 0.0021, "reward": 1.5200156569480896, "reward_std": 0.19639131426811218, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5512656569480896, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 280.78125, "epoch": 0.1982421875, "grad_norm": 2.142097886894366, "kl": 0.0496826171875, "learning_rate": 9.504394531249999e-07, "loss": 0.002, "reward": 1.6612927317619324, "reward_std": 0.15089121460914612, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6925427317619324, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 280.6484375, "epoch": 0.19873046875, "grad_norm": 4.366971614094934, "kl": 0.0616455078125, "learning_rate": 9.503173828124999e-07, "loss": 0.0025, "reward": 1.6004191040992737, "reward_std": 0.17288047075271606, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6160440444946289, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 319.625, "epoch": 0.19921875, "grad_norm": 1.7604823561903244, "kl": 0.058837890625, "learning_rate": 9.501953124999999e-07, "loss": 0.0024, "reward": 1.811613917350769, "reward_std": 0.10434301942586899, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.819426417350769, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 242.203125, "epoch": 0.19970703125, "grad_norm": 9.878985501778747, "kl": 0.041015625, "learning_rate": 9.500732421875e-07, "loss": 0.0016, "reward": 1.7147305607795715, "reward_std": 0.14838684350252151, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7225430309772491, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 271.2421875, "epoch": 0.2001953125, "grad_norm": 1.1168993817133859, "kl": 0.0477294921875, "learning_rate": 9.49951171875e-07, "loss": 0.0019, "reward": 1.6048610210418701, "reward_std": 0.06566739082336426, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6048609614372253, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 305.0703125, "epoch": 0.20068359375, "grad_norm": 2.4027814049067366, "kl": 0.0513916015625, "learning_rate": 9.498291015625e-07, "loss": 0.0021, "reward": 1.5927820801734924, "reward_std": 0.11115045472979546, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6318445801734924, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 267.1796875, "epoch": 0.201171875, "grad_norm": 2.2150494756825045, "kl": 0.0562744140625, "learning_rate": 9.4970703125e-07, "loss": 0.0022, "reward": 1.6925803422927856, "reward_std": 0.12557360157370567, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7082052826881409, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 326.4140625, "epoch": 0.20166015625, "grad_norm": 10.234390592196027, "kl": 0.0421142578125, "learning_rate": 9.495849609374999e-07, "loss": 0.0017, "reward": 1.625154733657837, "reward_std": 0.1099303588271141, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6485922932624817, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 350.53125, "epoch": 0.2021484375, "grad_norm": 1.8803371479808038, "kl": 0.0467529296875, "learning_rate": 9.494628906249999e-07, "loss": 0.0019, "reward": 1.7836476564407349, "reward_std": 0.0976153276860714, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7914601564407349, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 314.1484375, "epoch": 0.20263671875, "grad_norm": 2.530860829474323, "kl": 0.0577392578125, "learning_rate": 9.493408203125e-07, "loss": 0.0023, "reward": 1.6098762154579163, "reward_std": 0.23471946269273758, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.6880012154579163, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 292.828125, "epoch": 0.203125, "grad_norm": 4.285692563206467, "kl": 0.0509033203125, "learning_rate": 9.4921875e-07, "loss": 0.002, "reward": 1.609405517578125, "reward_std": 0.10902727395296097, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6094054579734802, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 296.2890625, "epoch": 0.20361328125, "grad_norm": 1.8103328712037055, "kl": 0.049560546875, "learning_rate": 9.490966796875e-07, "loss": 0.002, "reward": 1.5358877182006836, "reward_std": 0.08052598685026169, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5437001585960388, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 238.4609375, "epoch": 0.2041015625, "grad_norm": 2.3123346346362452, "kl": 0.0615234375, "learning_rate": 9.48974609375e-07, "loss": 0.0025, "reward": 1.5540345907211304, "reward_std": 0.11965020000934601, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5540346205234528, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 217.8515625, "epoch": 0.20458984375, "grad_norm": 2.17850912465939, "kl": 0.054443359375, "learning_rate": 9.488525390624999e-07, "loss": 0.0022, "reward": 1.7468233108520508, "reward_std": 0.07044094800949097, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7468233108520508, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 293.2578125, "epoch": 0.205078125, "grad_norm": 2.351303220308625, "kl": 0.0440673828125, "learning_rate": 9.487304687499999e-07, "loss": 0.0018, "reward": 1.6957443952560425, "reward_std": 0.04969111829996109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6957444846630096, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 318.0546875, "epoch": 0.20556640625, "grad_norm": 1.667715869495618, "kl": 0.05908203125, "learning_rate": 9.486083984374999e-07, "loss": 0.0024, "reward": 1.681714653968811, "reward_std": 0.14231722056865692, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6973395347595215, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 306.75, "epoch": 0.2060546875, "grad_norm": 6.464882822483457, "kl": 0.0556640625, "learning_rate": 9.48486328125e-07, "loss": 0.0022, "reward": 1.6321836113929749, "reward_std": 0.13830295950174332, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6399961411952972, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 231.7265625, "epoch": 0.20654296875, "grad_norm": 3.295724952577691, "kl": 0.0535888671875, "learning_rate": 9.483642578125e-07, "loss": 0.0021, "reward": 1.707470715045929, "reward_std": 0.1862129084765911, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.730908215045929, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 354.15625, "epoch": 0.20703125, "grad_norm": 2.167637361364238, "kl": 0.0465087890625, "learning_rate": 9.482421875e-07, "loss": 0.0019, "reward": 1.6849753856658936, "reward_std": 0.12426239252090454, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7162253856658936, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 292.203125, "epoch": 0.20751953125, "grad_norm": 2.0849894304046916, "kl": 0.04150390625, "learning_rate": 9.481201171875e-07, "loss": 0.0017, "reward": 1.6421186923980713, "reward_std": 0.13160578161478043, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6655561625957489, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 235.25, "epoch": 0.2080078125, "grad_norm": 2.0360917274571073, "kl": 0.062255859375, "learning_rate": 9.479980468749999e-07, "loss": 0.0025, "reward": 1.8281482458114624, "reward_std": 0.0754449162632227, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.82814821600914, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 242.7578125, "epoch": 0.20849609375, "grad_norm": 2.2356720906958594, "kl": 0.0491943359375, "learning_rate": 9.478759765624999e-07, "loss": 0.002, "reward": 1.7281526327133179, "reward_std": 0.08592578768730164, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7281526327133179, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 241.8203125, "epoch": 0.208984375, "grad_norm": 2.2370765591210873, "kl": 0.0584716796875, "learning_rate": 9.4775390625e-07, "loss": 0.0023, "reward": 1.6686657667160034, "reward_std": 0.12740540876984596, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6842906475067139, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 301.578125, "epoch": 0.20947265625, "grad_norm": 1.6150656197994475, "kl": 0.0616455078125, "learning_rate": 9.476318359375e-07, "loss": 0.0025, "reward": 1.591238021850586, "reward_std": 0.06698063388466835, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5912379920482635, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 356.9296875, "epoch": 0.2099609375, "grad_norm": 1.5061424458539174, "kl": 0.035888671875, "learning_rate": 9.47509765625e-07, "loss": 0.0014, "reward": 1.71708744764328, "reward_std": 0.06687924265861511, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7170874178409576, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 243.96875, "epoch": 0.21044921875, "grad_norm": 4.580041027491469, "kl": 0.0445556640625, "learning_rate": 9.473876953125e-07, "loss": 0.0018, "reward": 1.7206860780715942, "reward_std": 0.14960038661956787, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7441235482692719, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 241.1171875, "epoch": 0.2109375, "grad_norm": 3.9447569556870925, "kl": 0.05615234375, "learning_rate": 9.472656249999999e-07, "loss": 0.0022, "reward": 1.6832043528556824, "reward_std": 0.08023593947291374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6832043826580048, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 291.6953125, "epoch": 0.21142578125, "grad_norm": 2.547549175278579, "kl": 0.0560302734375, "learning_rate": 9.471435546874999e-07, "loss": 0.0022, "reward": 1.7101504802703857, "reward_std": 0.1703593209385872, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7257755398750305, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 291.125, "epoch": 0.2119140625, "grad_norm": 1.6164994536934667, "kl": 0.0482177734375, "learning_rate": 9.470214843749999e-07, "loss": 0.0019, "reward": 1.6205175518989563, "reward_std": 0.08653150871396065, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6283301115036011, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 293.953125, "epoch": 0.21240234375, "grad_norm": 2.1871293922270514, "kl": 0.0439453125, "learning_rate": 9.468994140625e-07, "loss": 0.0018, "reward": 1.7288724780082703, "reward_std": 0.12284732609987259, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7444974780082703, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 264.5, "epoch": 0.212890625, "grad_norm": 1.1688720954359133, "kl": 0.0484619140625, "learning_rate": 9.4677734375e-07, "loss": 0.0019, "reward": 1.769058644771576, "reward_std": 0.07042321562767029, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7768711447715759, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 253.4921875, "epoch": 0.21337890625, "grad_norm": 2.1010102368536674, "kl": 0.0406494140625, "learning_rate": 9.466552734375e-07, "loss": 0.0016, "reward": 1.750071406364441, "reward_std": 0.05832614004611969, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7500714361667633, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 270.3984375, "epoch": 0.2138671875, "grad_norm": 1.482992322234722, "kl": 0.041748046875, "learning_rate": 9.46533203125e-07, "loss": 0.0017, "reward": 1.7221877574920654, "reward_std": 0.04050422087311745, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.722187727689743, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 271.75, "epoch": 0.21435546875, "grad_norm": 3.5088817243048176, "kl": 0.0543212890625, "learning_rate": 9.464111328124999e-07, "loss": 0.0022, "reward": 1.7622933983802795, "reward_std": 0.10088678449392319, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7701059281826019, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 228.7578125, "epoch": 0.21484375, "grad_norm": 5.788688495019036, "kl": 0.0518798828125, "learning_rate": 9.462890624999999e-07, "loss": 0.0021, "reward": 1.5054885149002075, "reward_std": 0.09682680293917656, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5133009850978851, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 314.8984375, "epoch": 0.21533203125, "grad_norm": 2.1131575200131487, "kl": 0.04443359375, "learning_rate": 9.461669921875e-07, "loss": 0.0018, "reward": 1.6255145072937012, "reward_std": 0.11543078348040581, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6411395072937012, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 260.9140625, "epoch": 0.2158203125, "grad_norm": 2.3429544954821306, "kl": 0.044189453125, "learning_rate": 9.46044921875e-07, "loss": 0.0018, "reward": 1.8504613637924194, "reward_std": 0.06752173975110054, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8504613637924194, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 340.8125, "epoch": 0.21630859375, "grad_norm": 7.976580512227821, "kl": 0.0391845703125, "learning_rate": 9.459228515625e-07, "loss": 0.0016, "reward": 1.6511912941932678, "reward_std": 0.14625184237957, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6824413239955902, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 284.3203125, "epoch": 0.216796875, "grad_norm": 4.95319644301388, "kl": 0.0517578125, "learning_rate": 9.4580078125e-07, "loss": 0.0021, "reward": 1.7569758296012878, "reward_std": 0.1250363327562809, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7647883296012878, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 264.1875, "epoch": 0.21728515625, "grad_norm": 1.8993972344614596, "kl": 0.0367431640625, "learning_rate": 9.456787109374999e-07, "loss": 0.0015, "reward": 1.693526804447174, "reward_std": 0.11432855390012264, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7325893044471741, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 273.2734375, "epoch": 0.2177734375, "grad_norm": 7.777889583760453, "kl": 0.0439453125, "learning_rate": 9.455566406249999e-07, "loss": 0.0018, "reward": 1.6474227905273438, "reward_std": 0.11190011724829674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6474227905273438, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 320.953125, "epoch": 0.21826171875, "grad_norm": 2.009296377838892, "kl": 0.03955078125, "learning_rate": 9.454345703124999e-07, "loss": 0.0016, "reward": 1.654877483844757, "reward_std": 0.11765236407518387, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6705024838447571, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 320.390625, "epoch": 0.21875, "grad_norm": 2.925176137880303, "kl": 0.0401611328125, "learning_rate": 9.453125e-07, "loss": 0.0016, "reward": 1.738932490348816, "reward_std": 0.06926981918513775, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7389324307441711, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 203.828125, "epoch": 0.21923828125, "grad_norm": 4.340531405543576, "kl": 0.0577392578125, "learning_rate": 9.451904296875e-07, "loss": 0.0023, "reward": 1.683348834514618, "reward_std": 0.0711992010474205, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6833488345146179, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 325.421875, "epoch": 0.2197265625, "grad_norm": 2.518125948148069, "kl": 0.04833984375, "learning_rate": 9.45068359375e-07, "loss": 0.0019, "reward": 1.6158209443092346, "reward_std": 0.11858320608735085, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6314459443092346, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 289.7265625, "epoch": 0.22021484375, "grad_norm": 3.5363930541538355, "kl": 0.044921875, "learning_rate": 9.449462890625e-07, "loss": 0.0018, "reward": 1.5984613299369812, "reward_std": 0.13294245302677155, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.60627381503582, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 319.515625, "epoch": 0.220703125, "grad_norm": 3.149130598742103, "kl": 0.0435791015625, "learning_rate": 9.448242187499999e-07, "loss": 0.0017, "reward": 1.6226680278778076, "reward_std": 0.112908735871315, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6226680278778076, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 336.9453125, "epoch": 0.22119140625, "grad_norm": 2.4935032422406125, "kl": 0.0389404296875, "learning_rate": 9.447021484374999e-07, "loss": 0.0016, "reward": 1.6436303853988647, "reward_std": 0.06870114244520664, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6436303853988647, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 237.171875, "epoch": 0.2216796875, "grad_norm": 3.6116771357404747, "kl": 0.042236328125, "learning_rate": 9.445800781249999e-07, "loss": 0.0017, "reward": 1.7960193157196045, "reward_std": 0.03523706644773483, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7960193157196045, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 217.28125, "epoch": 0.22216796875, "grad_norm": 1.8643747959172932, "kl": 0.0467529296875, "learning_rate": 9.444580078125e-07, "loss": 0.0019, "reward": 1.6899807453155518, "reward_std": 0.061588347889482975, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6899808049201965, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 289.3359375, "epoch": 0.22265625, "grad_norm": 2.378293437154049, "kl": 0.0528564453125, "learning_rate": 9.443359375e-07, "loss": 0.0021, "reward": 1.6194549202919006, "reward_std": 0.07477627880871296, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6272674798965454, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 288.7734375, "epoch": 0.22314453125, "grad_norm": 1.7628549042900312, "kl": 0.050537109375, "learning_rate": 9.442138671875e-07, "loss": 0.002, "reward": 1.6203702688217163, "reward_std": 0.06756623834371567, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6203702390193939, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 230.234375, "epoch": 0.2236328125, "grad_norm": 3.212039118876721, "kl": 0.0477294921875, "learning_rate": 9.440917968749999e-07, "loss": 0.0019, "reward": 1.7326794862747192, "reward_std": 0.15392906218767166, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7404920160770416, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 379.9609375, "epoch": 0.22412109375, "grad_norm": 2.20882934467843, "kl": 0.0362548828125, "learning_rate": 9.439697265624999e-07, "loss": 0.0015, "reward": 1.669608473777771, "reward_std": 0.1226998120546341, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6696084141731262, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 310.7109375, "epoch": 0.224609375, "grad_norm": 3.919575507206736, "kl": 0.037841796875, "learning_rate": 9.438476562499999e-07, "loss": 0.0015, "reward": 1.8161649107933044, "reward_std": 0.08291263319551945, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8161648809909821, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 302.1796875, "epoch": 0.22509765625, "grad_norm": 1.8813163144677836, "kl": 0.0386962890625, "learning_rate": 9.437255859375e-07, "loss": 0.0015, "reward": 1.5858674049377441, "reward_std": 0.10385648906230927, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5858674198389053, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 303.9921875, "epoch": 0.2255859375, "grad_norm": 3.0446438454409304, "kl": 0.04052734375, "learning_rate": 9.43603515625e-07, "loss": 0.0016, "reward": 1.6508355736732483, "reward_std": 0.13546227663755417, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6586481332778931, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 327.765625, "epoch": 0.22607421875, "grad_norm": 5.145994036718485, "kl": 0.0416259765625, "learning_rate": 9.434814453125e-07, "loss": 0.0017, "reward": 1.7009857892990112, "reward_std": 0.11110249161720276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7009858191013336, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 217.203125, "epoch": 0.2265625, "grad_norm": 1.222277497239498, "kl": 0.052978515625, "learning_rate": 9.43359375e-07, "loss": 0.0021, "reward": 1.6271523833274841, "reward_std": 0.0367429880425334, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6271523833274841, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 232.828125, "epoch": 0.22705078125, "grad_norm": 1.372718735073741, "kl": 0.0382080078125, "learning_rate": 9.432373046874999e-07, "loss": 0.0015, "reward": 1.7885666489601135, "reward_std": 0.040253955870866776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7885666787624359, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 250.0625, "epoch": 0.2275390625, "grad_norm": 1.7337186238761468, "kl": 0.057861328125, "learning_rate": 9.431152343749999e-07, "loss": 0.0023, "reward": 1.628583014011383, "reward_std": 0.12626324221491814, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6442080438137054, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 384.8203125, "epoch": 0.22802734375, "grad_norm": 2.1322772710330575, "kl": 0.042236328125, "learning_rate": 9.429931640624999e-07, "loss": 0.0017, "reward": 1.6795091032981873, "reward_std": 0.10694251582026482, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6951341331005096, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 379.109375, "epoch": 0.228515625, "grad_norm": 1.083227516677285, "kl": 0.039794921875, "learning_rate": 9.4287109375e-07, "loss": 0.0016, "reward": 1.5487976670265198, "reward_std": 0.12695813924074173, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5800476670265198, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 286.0078125, "epoch": 0.22900390625, "grad_norm": 2.134927472325478, "kl": 0.046875, "learning_rate": 9.427490234375e-07, "loss": 0.0019, "reward": 1.7062729597091675, "reward_std": 0.12723471224308014, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7140854597091675, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 363.5078125, "epoch": 0.2294921875, "grad_norm": 1.7045575082525455, "kl": 0.033935546875, "learning_rate": 9.42626953125e-07, "loss": 0.0014, "reward": 1.693819522857666, "reward_std": 0.07639718800783157, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.693819522857666, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 237.546875, "epoch": 0.22998046875, "grad_norm": 1.4199964051941236, "kl": 0.053466796875, "learning_rate": 9.425048828124999e-07, "loss": 0.0021, "reward": 1.7025293707847595, "reward_std": 0.03216167027130723, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7025294005870819, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 372.671875, "epoch": 0.23046875, "grad_norm": 2.2853712078997583, "kl": 0.040283203125, "learning_rate": 9.423828124999999e-07, "loss": 0.0016, "reward": 1.633117914199829, "reward_std": 0.1547449231147766, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6565554141998291, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 210.4375, "epoch": 0.23095703125, "grad_norm": 2.9606587430145748, "kl": 0.05859375, "learning_rate": 9.422607421874999e-07, "loss": 0.0023, "reward": 1.7061492204666138, "reward_std": 0.11958225071430206, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7139617204666138, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 311.4140625, "epoch": 0.2314453125, "grad_norm": 4.284919438683781, "kl": 0.0533447265625, "learning_rate": 9.42138671875e-07, "loss": 0.0021, "reward": 1.6417680978775024, "reward_std": 0.10428282991051674, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.65739306807518, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 346.8828125, "epoch": 0.23193359375, "grad_norm": 3.9023448946857533, "kl": 0.04266357421875, "learning_rate": 9.420166015625e-07, "loss": 0.0017, "reward": 1.7126132249832153, "reward_std": 0.09138727188110352, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7126132547855377, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 231.0859375, "epoch": 0.232421875, "grad_norm": 8.45607280183371, "kl": 0.052734375, "learning_rate": 9.4189453125e-07, "loss": 0.0021, "reward": 1.6888734102249146, "reward_std": 0.08186532184481621, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6888734102249146, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 338.390625, "epoch": 0.23291015625, "grad_norm": 3.7100837878450594, "kl": 0.04296875, "learning_rate": 9.417724609375e-07, "loss": 0.0017, "reward": 1.7216296195983887, "reward_std": 0.170655507594347, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7372545599937439, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 337.140625, "epoch": 0.2333984375, "grad_norm": 1.8396296874789484, "kl": 0.041748046875, "learning_rate": 9.416503906249999e-07, "loss": 0.0017, "reward": 1.6159728169441223, "reward_std": 0.19198870658874512, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6706602573394775, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 209.1015625, "epoch": 0.23388671875, "grad_norm": 3.539797062671013, "kl": 0.048583984375, "learning_rate": 9.415283203124999e-07, "loss": 0.0019, "reward": 1.7715952396392822, "reward_std": 0.07343994826078415, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7715952396392822, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 306.09375, "epoch": 0.234375, "grad_norm": 2.857203091318844, "kl": 0.0416259765625, "learning_rate": 9.414062499999999e-07, "loss": 0.0017, "reward": 1.706653356552124, "reward_std": 0.09830702841281891, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7066532969474792, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 340.140625, "epoch": 0.23486328125, "grad_norm": 10.240794373354676, "kl": 0.04052734375, "learning_rate": 9.412841796875e-07, "loss": 0.0016, "reward": 1.755756914615631, "reward_std": 0.07962564006447792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7557569742202759, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 291.9921875, "epoch": 0.2353515625, "grad_norm": 4.440714955855732, "kl": 0.046630859375, "learning_rate": 9.41162109375e-07, "loss": 0.0019, "reward": 1.588149607181549, "reward_std": 0.07681831158697605, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5959621071815491, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 369.28125, "epoch": 0.23583984375, "grad_norm": 2.0909539375914736, "kl": 0.03387451171875, "learning_rate": 9.410400390625e-07, "loss": 0.0014, "reward": 1.768026053905487, "reward_std": 0.1362891048192978, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7836510539054871, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 260.625, "epoch": 0.236328125, "grad_norm": 4.002760763893457, "kl": 0.0521240234375, "learning_rate": 9.4091796875e-07, "loss": 0.0021, "reward": 1.7147894501686096, "reward_std": 0.08333645388484001, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.714789479970932, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 278.8984375, "epoch": 0.23681640625, "grad_norm": 2.2903477892017756, "kl": 0.046875, "learning_rate": 9.407958984374999e-07, "loss": 0.0019, "reward": 1.7409107685089111, "reward_std": 0.04808063432574272, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7409107685089111, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 284.4140625, "epoch": 0.2373046875, "grad_norm": 2.3093773994918347, "kl": 0.0355224609375, "learning_rate": 9.406738281249999e-07, "loss": 0.0014, "reward": 1.6987740993499756, "reward_std": 0.1341363899409771, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7143990695476532, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 311.5078125, "epoch": 0.23779296875, "grad_norm": 2.720150565834579, "kl": 0.0426025390625, "learning_rate": 9.405517578125e-07, "loss": 0.0017, "reward": 1.6762725114822388, "reward_std": 0.09658115357160568, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6840850114822388, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 358.7734375, "epoch": 0.23828125, "grad_norm": 3.4864904794597886, "kl": 0.0489501953125, "learning_rate": 9.404296875e-07, "loss": 0.002, "reward": 1.6006226539611816, "reward_std": 0.13109473884105682, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6162476539611816, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 313.515625, "epoch": 0.23876953125, "grad_norm": 2.6873145138368666, "kl": 0.0526123046875, "learning_rate": 9.403076171875e-07, "loss": 0.0021, "reward": 1.6052095890045166, "reward_std": 0.17836012691259384, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6520847082138062, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 280.6796875, "epoch": 0.2392578125, "grad_norm": 3.62466135309983, "kl": 0.05224609375, "learning_rate": 9.40185546875e-07, "loss": 0.0021, "reward": 1.5602875351905823, "reward_std": 0.09847164526581764, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5602875351905823, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 297.734375, "epoch": 0.23974609375, "grad_norm": 3.8234332031826432, "kl": 0.0458984375, "learning_rate": 9.400634765624999e-07, "loss": 0.0018, "reward": 1.6198760867118835, "reward_std": 0.10435886308550835, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6198760569095612, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 262.84375, "epoch": 0.240234375, "grad_norm": 1.643266133894865, "kl": 0.0531005859375, "learning_rate": 9.399414062499999e-07, "loss": 0.0021, "reward": 1.6920706629753113, "reward_std": 0.13326343521475792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6920706927776337, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 231.40625, "epoch": 0.24072265625, "grad_norm": 3.0050999042930053, "kl": 0.04736328125, "learning_rate": 9.398193359374999e-07, "loss": 0.0019, "reward": 1.6835868954658508, "reward_std": 0.06352141872048378, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6835868954658508, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 185.0234375, "epoch": 0.2412109375, "grad_norm": 1.982897008783888, "kl": 0.0460205078125, "learning_rate": 9.39697265625e-07, "loss": 0.0018, "reward": 1.7277058362960815, "reward_std": 0.029988901689648628, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7277058362960815, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 266.3671875, "epoch": 0.24169921875, "grad_norm": 3.432562466521374, "kl": 0.0494384765625, "learning_rate": 9.395751953125e-07, "loss": 0.002, "reward": 1.630252718925476, "reward_std": 0.10348817706108093, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6302527189254761, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 371.3203125, "epoch": 0.2421875, "grad_norm": 1.938316540344301, "kl": 0.061279296875, "learning_rate": 9.39453125e-07, "loss": 0.0024, "reward": 1.5763072967529297, "reward_std": 0.1704563107341528, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5997447669506073, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 211.46875, "epoch": 0.24267578125, "grad_norm": 1.5475227992685436, "kl": 0.0443115234375, "learning_rate": 9.393310546875e-07, "loss": 0.0018, "reward": 1.796213448047638, "reward_std": 0.08575894869863987, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8040259480476379, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 221.8046875, "epoch": 0.2431640625, "grad_norm": 6.194121349015061, "kl": 0.0538330078125, "learning_rate": 9.392089843749999e-07, "loss": 0.0022, "reward": 1.7806832194328308, "reward_std": 0.04140526428818703, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7806831896305084, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 312.234375, "epoch": 0.24365234375, "grad_norm": 4.146851999613055, "kl": 0.0482177734375, "learning_rate": 9.390869140624999e-07, "loss": 0.0019, "reward": 1.6361583471298218, "reward_std": 0.12326683104038239, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6517833769321442, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 302.875, "epoch": 0.244140625, "grad_norm": 2.3192212317561327, "kl": 0.0556640625, "learning_rate": 9.3896484375e-07, "loss": 0.0022, "reward": 1.7082719802856445, "reward_std": 0.11271853744983673, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7160845100879669, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 310.09375, "epoch": 0.24462890625, "grad_norm": 2.530658345316516, "kl": 0.0523681640625, "learning_rate": 9.388427734375e-07, "loss": 0.0021, "reward": 1.6015813946723938, "reward_std": 0.11473493091762066, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6406438946723938, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 324.4296875, "epoch": 0.2451171875, "grad_norm": 3.374686115356342, "kl": 0.0780029296875, "learning_rate": 9.38720703125e-07, "loss": 0.0031, "reward": 1.670573353767395, "reward_std": 0.19502687454223633, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6940109133720398, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 323.9609375, "epoch": 0.24560546875, "grad_norm": 3.975204177665374, "kl": 0.0465087890625, "learning_rate": 9.385986328125e-07, "loss": 0.0019, "reward": 1.7460113763809204, "reward_std": 0.06656001135706902, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7460113167762756, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 328.1171875, "epoch": 0.24609375, "grad_norm": 2.3164265465647267, "kl": 0.0430908203125, "learning_rate": 9.384765624999999e-07, "loss": 0.0017, "reward": 1.789831519126892, "reward_std": 0.06405875086784363, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7898315191268921, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 295.4765625, "epoch": 0.24658203125, "grad_norm": 3.6736958175167618, "kl": 0.048095703125, "learning_rate": 9.383544921874999e-07, "loss": 0.0019, "reward": 1.7674906253814697, "reward_std": 0.04135966673493385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7674906253814697, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 295.5859375, "epoch": 0.2470703125, "grad_norm": 2.9483608938527186, "kl": 0.0535888671875, "learning_rate": 9.382324218749999e-07, "loss": 0.0021, "reward": 1.6795161962509155, "reward_std": 0.1258496269583702, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6873287260532379, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 312.4609375, "epoch": 0.24755859375, "grad_norm": 10.00278448029272, "kl": 0.04833984375, "learning_rate": 9.381103515625e-07, "loss": 0.0019, "reward": 1.7105889916419983, "reward_std": 0.06833425909280777, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7105889916419983, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 437.2734375, "epoch": 0.248046875, "grad_norm": 22.4048260879023, "kl": 0.0421142578125, "learning_rate": 9.3798828125e-07, "loss": 0.0017, "reward": 1.7356719970703125, "reward_std": 0.14341094344854355, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7512970268726349, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 323.1328125, "epoch": 0.24853515625, "grad_norm": 3.690996101063405, "kl": 0.0587158203125, "learning_rate": 9.378662109375e-07, "loss": 0.0023, "reward": 1.5387169122695923, "reward_std": 0.15988320112228394, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5934043824672699, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 375.171875, "epoch": 0.2490234375, "grad_norm": 1.760028110863856, "kl": 0.060302734375, "learning_rate": 9.37744140625e-07, "loss": 0.0024, "reward": 1.6823578476905823, "reward_std": 0.13578759506344795, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.690170407295227, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 289.75, "epoch": 0.24951171875, "grad_norm": 1.4416400099241444, "kl": 0.0535888671875, "learning_rate": 9.376220703124999e-07, "loss": 0.0021, "reward": 1.654783546924591, "reward_std": 0.10625720396637917, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6625960469245911, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 268.4140625, "epoch": 0.25, "grad_norm": 1.168208336958592, "kl": 0.05078125, "learning_rate": 9.374999999999999e-07, "loss": 0.002, "reward": 1.5635674595832825, "reward_std": 0.10676468908786774, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5713800489902496, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 289.8359375, "epoch": 0.25048828125, "grad_norm": 1.2959350822555695, "kl": 0.049560546875, "learning_rate": 9.373779296875e-07, "loss": 0.002, "reward": 1.6148346662521362, "reward_std": 0.05857887305319309, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6148346662521362, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 302.2734375, "epoch": 0.2509765625, "grad_norm": 2.230807942706403, "kl": 0.0537109375, "learning_rate": 9.37255859375e-07, "loss": 0.0021, "reward": 1.7130588293075562, "reward_std": 0.099614929407835, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7130588293075562, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 304.7109375, "epoch": 0.25146484375, "grad_norm": 1.3346796140624877, "kl": 0.0501708984375, "learning_rate": 9.371337890625e-07, "loss": 0.002, "reward": 1.6742193698883057, "reward_std": 0.10047866404056549, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6898443400859833, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 323.84375, "epoch": 0.251953125, "grad_norm": 1.5965524786083076, "kl": 0.0374755859375, "learning_rate": 9.3701171875e-07, "loss": 0.0015, "reward": 1.7269166707992554, "reward_std": 0.09766197204589844, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7347292006015778, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 259.8828125, "epoch": 0.25244140625, "grad_norm": 2.5203178337483747, "kl": 0.040771484375, "learning_rate": 9.368896484374999e-07, "loss": 0.0016, "reward": 1.6974033117294312, "reward_std": 0.03398803994059563, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6974032521247864, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 258.484375, "epoch": 0.2529296875, "grad_norm": 15.724284771000056, "kl": 0.0438232421875, "learning_rate": 9.367675781249999e-07, "loss": 0.0018, "reward": 1.769521713256836, "reward_std": 0.07974059507250786, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7773342132568359, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 278.6484375, "epoch": 0.25341796875, "grad_norm": 12.294650487546035, "kl": 0.041259765625, "learning_rate": 9.366455078124999e-07, "loss": 0.0017, "reward": 1.7424204349517822, "reward_std": 0.07815677672624588, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7424204349517822, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 298.1796875, "epoch": 0.25390625, "grad_norm": 1.9577664086872122, "kl": 0.0418701171875, "learning_rate": 9.365234375e-07, "loss": 0.0017, "reward": 1.5817663669586182, "reward_std": 0.10854971595108509, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5895789265632629, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 351.546875, "epoch": 0.25439453125, "grad_norm": 1.9802073044830337, "kl": 0.041259765625, "learning_rate": 9.364013671875e-07, "loss": 0.0017, "reward": 1.775498867034912, "reward_std": 0.06388038024306297, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7754988670349121, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 291.640625, "epoch": 0.2548828125, "grad_norm": 6.132107488020418, "kl": 0.054931640625, "learning_rate": 9.36279296875e-07, "loss": 0.0022, "reward": 1.8140791654586792, "reward_std": 0.060922037810087204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8140791952610016, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 222.2109375, "epoch": 0.25537109375, "grad_norm": 2.228119855943763, "kl": 0.0501708984375, "learning_rate": 9.361572265625e-07, "loss": 0.002, "reward": 1.6406488418579102, "reward_std": 0.0903150886297226, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6406488716602325, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 294.828125, "epoch": 0.255859375, "grad_norm": 3.8874443170507753, "kl": 0.041748046875, "learning_rate": 9.360351562499999e-07, "loss": 0.0017, "reward": 1.7140488624572754, "reward_std": 0.09602710604667664, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7140487730503082, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 343.203125, "epoch": 0.25634765625, "grad_norm": 12.272163041477432, "kl": 0.0487060546875, "learning_rate": 9.359130859374999e-07, "loss": 0.0019, "reward": 1.7117294073104858, "reward_std": 0.10148574784398079, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7195418775081635, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 324.734375, "epoch": 0.2568359375, "grad_norm": 6.216510978293266, "kl": 0.0477294921875, "learning_rate": 9.35791015625e-07, "loss": 0.0019, "reward": 1.6478030681610107, "reward_std": 0.13338213600218296, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6556155979633331, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 312.2421875, "epoch": 0.25732421875, "grad_norm": 1.1994853174094693, "kl": 0.04541015625, "learning_rate": 9.356689453125e-07, "loss": 0.0018, "reward": 1.741489827632904, "reward_std": 0.0871284119784832, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.749302327632904, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 242.640625, "epoch": 0.2578125, "grad_norm": 2.7844412848504008, "kl": 0.0419921875, "learning_rate": 9.35546875e-07, "loss": 0.0017, "reward": 1.6761191487312317, "reward_std": 0.06489459797739983, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6761191487312317, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 371.9921875, "epoch": 0.25830078125, "grad_norm": 2.445482124975617, "kl": 0.05517578125, "learning_rate": 9.354248046875e-07, "loss": 0.0022, "reward": 1.649292767047882, "reward_std": 0.09409752860665321, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6492927670478821, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 318.5234375, "epoch": 0.2587890625, "grad_norm": 3.1389050778274794, "kl": 0.0455322265625, "learning_rate": 9.353027343749999e-07, "loss": 0.0018, "reward": 1.7897993326187134, "reward_std": 0.06516874581575394, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7897992730140686, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 246.453125, "epoch": 0.25927734375, "grad_norm": 5.7428193650994075, "kl": 0.0457763671875, "learning_rate": 9.351806640624999e-07, "loss": 0.0018, "reward": 1.7966364622116089, "reward_std": 0.10276348143815994, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7966364324092865, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 204.8984375, "epoch": 0.259765625, "grad_norm": 2.1752945927652134, "kl": 0.0531005859375, "learning_rate": 9.350585937499999e-07, "loss": 0.0021, "reward": 1.6889333724975586, "reward_std": 0.06486545503139496, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6889333724975586, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 274.84375, "epoch": 0.26025390625, "grad_norm": 1.3878938520937938, "kl": 0.044189453125, "learning_rate": 9.349365234375e-07, "loss": 0.0018, "reward": 1.8459346890449524, "reward_std": 0.03464473132044077, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8459346890449524, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 243.1015625, "epoch": 0.2607421875, "grad_norm": 2.9082227869155868, "kl": 0.03704833984375, "learning_rate": 9.34814453125e-07, "loss": 0.0015, "reward": 1.7827296257019043, "reward_std": 0.05504240095615387, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7827296257019043, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 311.3203125, "epoch": 0.26123046875, "grad_norm": 2.5055735390456455, "kl": 0.046630859375, "learning_rate": 9.346923828125e-07, "loss": 0.0019, "reward": 1.6670472025871277, "reward_std": 0.14498621970415115, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6826722323894501, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 229.328125, "epoch": 0.26171875, "grad_norm": 2.1695056878186287, "kl": 0.05078125, "learning_rate": 9.345703125e-07, "loss": 0.002, "reward": 1.7323461771011353, "reward_std": 0.06361747533082962, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7323460876941681, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 334.4609375, "epoch": 0.26220703125, "grad_norm": 3.3136437260627503, "kl": 0.04931640625, "learning_rate": 9.344482421874999e-07, "loss": 0.002, "reward": 1.7232381105422974, "reward_std": 0.12078379839658737, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7310506999492645, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 308.3515625, "epoch": 0.2626953125, "grad_norm": 2.456324374339407, "kl": 0.04345703125, "learning_rate": 9.343261718749999e-07, "loss": 0.0017, "reward": 1.7056252360343933, "reward_std": 0.09498313069343567, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7056251764297485, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 369.734375, "epoch": 0.26318359375, "grad_norm": 4.079059533712564, "kl": 0.039794921875, "learning_rate": 9.342041015625e-07, "loss": 0.0016, "reward": 1.4682893753051758, "reward_std": 0.1072283387184143, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.530789390206337, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 410.2265625, "epoch": 0.263671875, "grad_norm": 10.45654393994142, "kl": 0.0357666015625, "learning_rate": 9.3408203125e-07, "loss": 0.0014, "reward": 1.7631608843803406, "reward_std": 0.13105908036231995, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7787858843803406, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 339.0, "epoch": 0.26416015625, "grad_norm": 1.7102279475438495, "kl": 0.0360107421875, "learning_rate": 9.339599609375e-07, "loss": 0.0014, "reward": 1.6794561743736267, "reward_std": 0.06675281748175621, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6794561147689819, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 331.3828125, "epoch": 0.2646484375, "grad_norm": 4.522944699128447, "kl": 0.04443359375, "learning_rate": 9.33837890625e-07, "loss": 0.0018, "reward": 1.7426947355270386, "reward_std": 0.08230987191200256, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7426947355270386, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 279.65625, "epoch": 0.26513671875, "grad_norm": 2.6650826688792857, "kl": 0.0450439453125, "learning_rate": 9.337158203124999e-07, "loss": 0.0018, "reward": 1.7618120908737183, "reward_std": 0.06165020540356636, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7618121206760406, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 284.5078125, "epoch": 0.265625, "grad_norm": 2.037006291879131, "kl": 0.051025390625, "learning_rate": 9.335937499999999e-07, "loss": 0.002, "reward": 1.646928310394287, "reward_std": 0.17904935777187347, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6469283103942871, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 330.9140625, "epoch": 0.26611328125, "grad_norm": 2.297298122509907, "kl": 0.039306640625, "learning_rate": 9.334716796874999e-07, "loss": 0.0016, "reward": 1.6567280888557434, "reward_std": 0.10316119715571404, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6567280292510986, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 326.3125, "epoch": 0.2666015625, "grad_norm": 2.660291456006694, "kl": 0.039306640625, "learning_rate": 9.33349609375e-07, "loss": 0.0016, "reward": 1.823375165462494, "reward_std": 0.05490726791322231, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8233751654624939, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 266.1640625, "epoch": 0.26708984375, "grad_norm": 6.13270694396882, "kl": 0.0443115234375, "learning_rate": 9.332275390625e-07, "loss": 0.0018, "reward": 1.6771809458732605, "reward_std": 0.0902215950191021, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6771808862686157, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 361.15625, "epoch": 0.267578125, "grad_norm": 2.2633016069892777, "kl": 0.0455322265625, "learning_rate": 9.3310546875e-07, "loss": 0.0018, "reward": 1.6595528721809387, "reward_std": 0.12929360568523407, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6673653721809387, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 318.203125, "epoch": 0.26806640625, "grad_norm": 2.805590510649819, "kl": 0.0498046875, "learning_rate": 9.329833984375e-07, "loss": 0.002, "reward": 1.7251054048538208, "reward_std": 0.10590995103120804, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7407303750514984, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 342.640625, "epoch": 0.2685546875, "grad_norm": 1.5868774720575003, "kl": 0.045166015625, "learning_rate": 9.328613281249999e-07, "loss": 0.0018, "reward": 1.5312697887420654, "reward_std": 0.10218230821192265, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5390822738409042, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 282.6171875, "epoch": 0.26904296875, "grad_norm": 5.217204405936032, "kl": 0.0523681640625, "learning_rate": 9.327392578124999e-07, "loss": 0.0021, "reward": 1.6690084338188171, "reward_std": 0.11925657838582993, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7002584338188171, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 205.0625, "epoch": 0.26953125, "grad_norm": 2.332959009144248, "kl": 0.05322265625, "learning_rate": 9.326171874999999e-07, "loss": 0.0021, "reward": 1.7313638925552368, "reward_std": 0.06417965516448021, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7313639223575592, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 242.2421875, "epoch": 0.27001953125, "grad_norm": 2.6193153669407203, "kl": 0.0447998046875, "learning_rate": 9.324951171875e-07, "loss": 0.0018, "reward": 1.7203855514526367, "reward_std": 0.07075966894626617, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7203856408596039, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 316.3515625, "epoch": 0.2705078125, "grad_norm": 2.5899560028614697, "kl": 0.0489501953125, "learning_rate": 9.32373046875e-07, "loss": 0.002, "reward": 1.6837428212165833, "reward_std": 0.11842398717999458, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7071803212165833, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 243.625, "epoch": 0.27099609375, "grad_norm": 2.498447085669372, "kl": 0.0595703125, "learning_rate": 9.322509765625e-07, "loss": 0.0024, "reward": 1.6304301023483276, "reward_std": 0.07888209074735641, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6304300427436829, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 241.5, "epoch": 0.271484375, "grad_norm": 4.097199567462612, "kl": 0.072021484375, "learning_rate": 9.321289062499999e-07, "loss": 0.0029, "reward": 1.775884211063385, "reward_std": 0.07060272060334682, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7758842408657074, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 290.3046875, "epoch": 0.27197265625, "grad_norm": 4.5439033784171095, "kl": 0.0517578125, "learning_rate": 9.320068359374999e-07, "loss": 0.0021, "reward": 1.6852914690971375, "reward_std": 0.1067960262298584, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6852914988994598, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 285.8046875, "epoch": 0.2724609375, "grad_norm": 1.7251350694603302, "kl": 0.049560546875, "learning_rate": 9.318847656249999e-07, "loss": 0.002, "reward": 1.6576202511787415, "reward_std": 0.09488710761070251, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6654327809810638, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 363.03125, "epoch": 0.27294921875, "grad_norm": 3.92150434058649, "kl": 0.041015625, "learning_rate": 9.317626953125e-07, "loss": 0.0016, "reward": 1.5969886183738708, "reward_std": 0.12209014222025871, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6126136183738708, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 334.3359375, "epoch": 0.2734375, "grad_norm": 2.585612743662862, "kl": 0.0447998046875, "learning_rate": 9.31640625e-07, "loss": 0.0018, "reward": 1.6855441331863403, "reward_std": 0.11337171494960785, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6855441033840179, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 401.21875, "epoch": 0.27392578125, "grad_norm": 5.285724233905254, "kl": 0.03778076171875, "learning_rate": 9.315185546875e-07, "loss": 0.0015, "reward": 1.64777010679245, "reward_std": 0.22076285630464554, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.69464510679245, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 252.671875, "epoch": 0.2744140625, "grad_norm": 1.8126430914250469, "kl": 0.0450439453125, "learning_rate": 9.31396484375e-07, "loss": 0.0018, "reward": 1.7356610298156738, "reward_std": 0.10725349560379982, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7356610596179962, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 337.71875, "epoch": 0.27490234375, "grad_norm": 1.9656321676605797, "kl": 0.05322265625, "learning_rate": 9.312744140624999e-07, "loss": 0.0021, "reward": 1.6761003732681274, "reward_std": 0.07711060158908367, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6761003732681274, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 270.0390625, "epoch": 0.275390625, "grad_norm": 3.2377061229845836, "kl": 0.0592041015625, "learning_rate": 9.311523437499999e-07, "loss": 0.0024, "reward": 1.6362444162368774, "reward_std": 0.10095639899373055, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6518694162368774, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 297.8671875, "epoch": 0.27587890625, "grad_norm": 25.909912449399112, "kl": 0.05029296875, "learning_rate": 9.310302734374999e-07, "loss": 0.002, "reward": 1.686921238899231, "reward_std": 0.07121211476624012, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.686921238899231, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 339.859375, "epoch": 0.2763671875, "grad_norm": 2.2331716793343084, "kl": 0.0450439453125, "learning_rate": 9.30908203125e-07, "loss": 0.0018, "reward": 1.7039056420326233, "reward_std": 0.06212746538221836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7039056420326233, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 307.9453125, "epoch": 0.27685546875, "grad_norm": 2.2291085862491617, "kl": 0.04150390625, "learning_rate": 9.307861328125e-07, "loss": 0.0017, "reward": 1.7227251529693604, "reward_std": 0.1121636014431715, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7383500933647156, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 312.796875, "epoch": 0.27734375, "grad_norm": 1.3421158309646601, "kl": 0.0401611328125, "learning_rate": 9.306640625e-07, "loss": 0.0016, "reward": 1.570694386959076, "reward_std": 0.1121312715113163, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6019443571567535, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 198.0625, "epoch": 0.27783203125, "grad_norm": 1.5020919887284745, "kl": 0.0501708984375, "learning_rate": 9.305419921875e-07, "loss": 0.002, "reward": 1.8749535083770752, "reward_std": 0.025433492846786976, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8749535381793976, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 336.2734375, "epoch": 0.2783203125, "grad_norm": 2.342753166145787, "kl": 0.0411376953125, "learning_rate": 9.304199218749999e-07, "loss": 0.0016, "reward": 1.8102790713310242, "reward_std": 0.09545211121439934, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8180915713310242, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 220.03125, "epoch": 0.27880859375, "grad_norm": 2.2742980387573652, "kl": 0.0556640625, "learning_rate": 9.302978515624999e-07, "loss": 0.0022, "reward": 1.8116941452026367, "reward_std": 0.09244917519390583, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8116941154003143, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 308.5859375, "epoch": 0.279296875, "grad_norm": 9.595557459930381, "kl": 0.0496826171875, "learning_rate": 9.3017578125e-07, "loss": 0.002, "reward": 1.6191758513450623, "reward_std": 0.09628532081842422, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6348008215427399, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 182.3671875, "epoch": 0.27978515625, "grad_norm": 18.896318107676453, "kl": 0.068115234375, "learning_rate": 9.300537109375e-07, "loss": 0.0027, "reward": 1.7079237699508667, "reward_std": 0.1069163903594017, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7079237401485443, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 245.5234375, "epoch": 0.2802734375, "grad_norm": 0.8129066081542312, "kl": 0.0462646484375, "learning_rate": 9.29931640625e-07, "loss": 0.0018, "reward": 1.6988362073898315, "reward_std": 0.011203366797417402, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6988362371921539, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 209.3984375, "epoch": 0.28076171875, "grad_norm": 0.993947791093584, "kl": 0.06201171875, "learning_rate": 9.298095703125e-07, "loss": 0.0025, "reward": 1.794031023979187, "reward_std": 0.08120781742036343, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8174684643745422, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 365.640625, "epoch": 0.28125, "grad_norm": 6.676572461323387, "kl": 0.040283203125, "learning_rate": 9.296874999999999e-07, "loss": 0.0016, "reward": 1.726797878742218, "reward_std": 0.08194676041603088, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726797878742218, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 246.9609375, "epoch": 0.28173828125, "grad_norm": 1.5079255512272232, "kl": 0.060791015625, "learning_rate": 9.295654296874999e-07, "loss": 0.0024, "reward": 1.7783808708190918, "reward_std": 0.049073804169893265, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7783808708190918, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 367.9296875, "epoch": 0.2822265625, "grad_norm": 1.3798475015598377, "kl": 0.0418701171875, "learning_rate": 9.294433593749999e-07, "loss": 0.0017, "reward": 1.7986710667610168, "reward_std": 0.03962104860693216, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7986710667610168, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 281.7578125, "epoch": 0.28271484375, "grad_norm": 2.505394028512915, "kl": 0.0567626953125, "learning_rate": 9.293212890625e-07, "loss": 0.0023, "reward": 1.6889582872390747, "reward_std": 0.07442482188344002, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6889582574367523, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 361.0390625, "epoch": 0.283203125, "grad_norm": 1.8850278687560447, "kl": 0.0411376953125, "learning_rate": 9.2919921875e-07, "loss": 0.0016, "reward": 1.6971967816352844, "reward_std": 0.09730785340070724, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6971967816352844, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 300.4609375, "epoch": 0.28369140625, "grad_norm": 1.7641909702416805, "kl": 0.0494384765625, "learning_rate": 9.290771484375e-07, "loss": 0.002, "reward": 1.7254841923713684, "reward_std": 0.09819715097546577, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7332966923713684, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 210.7109375, "epoch": 0.2841796875, "grad_norm": 1.5034625672902855, "kl": 0.044677734375, "learning_rate": 9.28955078125e-07, "loss": 0.0018, "reward": 1.6941693425178528, "reward_std": 0.11884243786334991, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6941693425178528, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 342.484375, "epoch": 0.28466796875, "grad_norm": 1.8739175117936375, "kl": 0.056640625, "learning_rate": 9.288330078124999e-07, "loss": 0.0023, "reward": 1.7098997831344604, "reward_std": 0.13007067143917084, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7489623129367828, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 243.8828125, "epoch": 0.28515625, "grad_norm": 3.302249530616915, "kl": 0.0550537109375, "learning_rate": 9.287109374999999e-07, "loss": 0.0022, "reward": 1.8262977600097656, "reward_std": 0.07570694014430046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8262978196144104, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 271.34375, "epoch": 0.28564453125, "grad_norm": 2.3247621756543406, "kl": 0.039794921875, "learning_rate": 9.285888671875e-07, "loss": 0.0016, "reward": 1.691820740699768, "reward_std": 0.10432455316185951, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6918207406997681, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 252.078125, "epoch": 0.2861328125, "grad_norm": 2.919815872077742, "kl": 0.0537109375, "learning_rate": 9.28466796875e-07, "loss": 0.0021, "reward": 1.8132377862930298, "reward_std": 0.0450489092618227, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8132377862930298, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 260.9453125, "epoch": 0.28662109375, "grad_norm": 2.315450631479818, "kl": 0.0491943359375, "learning_rate": 9.283447265625e-07, "loss": 0.002, "reward": 1.567336082458496, "reward_std": 0.04566051810979843, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5673360526561737, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 269.9453125, "epoch": 0.287109375, "grad_norm": 4.866170468108119, "kl": 0.0443115234375, "learning_rate": 9.2822265625e-07, "loss": 0.0018, "reward": 1.7104328870773315, "reward_std": 0.047424353659152985, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7104328274726868, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 320.1484375, "epoch": 0.28759765625, "grad_norm": 2.464877601750045, "kl": 0.0465087890625, "learning_rate": 9.281005859374999e-07, "loss": 0.0019, "reward": 1.6946337819099426, "reward_std": 0.13272637128829956, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7258838415145874, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 371.453125, "epoch": 0.2880859375, "grad_norm": 2.8034456562750654, "kl": 0.037841796875, "learning_rate": 9.279785156249999e-07, "loss": 0.0015, "reward": 1.7395535707473755, "reward_std": 0.10018676891922951, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7473660111427307, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 315.546875, "epoch": 0.28857421875, "grad_norm": 7.447215195080596, "kl": 0.043701171875, "learning_rate": 9.278564453124999e-07, "loss": 0.0017, "reward": 1.69717139005661, "reward_std": 0.09286946710199118, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7518589496612549, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 220.8125, "epoch": 0.2890625, "grad_norm": 1.4897448677701148, "kl": 0.0584716796875, "learning_rate": 9.27734375e-07, "loss": 0.0023, "reward": 1.7233901023864746, "reward_std": 0.04082547128200531, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7233900725841522, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 360.1953125, "epoch": 0.28955078125, "grad_norm": 3.7207228740501317, "kl": 0.0531005859375, "learning_rate": 9.276123046875e-07, "loss": 0.0021, "reward": 1.641968011856079, "reward_std": 0.1139497272670269, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6575929820537567, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 379.921875, "epoch": 0.2900390625, "grad_norm": 2.237099552618115, "kl": 0.03662109375, "learning_rate": 9.27490234375e-07, "loss": 0.0015, "reward": 1.646517038345337, "reward_std": 0.28582026064395905, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.7090170085430145, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 272.109375, "epoch": 0.29052734375, "grad_norm": 3.3208620909986246, "kl": 0.0455322265625, "learning_rate": 9.273681640625e-07, "loss": 0.0018, "reward": 1.6810371279716492, "reward_std": 0.08739523217082024, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6810370683670044, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 261.03125, "epoch": 0.291015625, "grad_norm": 0.9791867105353927, "kl": 0.04736328125, "learning_rate": 9.272460937499999e-07, "loss": 0.0019, "reward": 1.8734647035598755, "reward_std": 0.031122705899178982, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8734646737575531, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 302.09375, "epoch": 0.29150390625, "grad_norm": 2.1825079397965843, "kl": 0.0369873046875, "learning_rate": 9.271240234374999e-07, "loss": 0.0015, "reward": 1.8181806802749634, "reward_std": 0.06168582662940025, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8181805908679962, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 272.9296875, "epoch": 0.2919921875, "grad_norm": 2.584490232315663, "kl": 0.044677734375, "learning_rate": 9.27001953125e-07, "loss": 0.0018, "reward": 1.6417737007141113, "reward_std": 0.03647463582456112, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6417737007141113, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 256.6953125, "epoch": 0.29248046875, "grad_norm": 2.4277242597465607, "kl": 0.0513916015625, "learning_rate": 9.268798828125e-07, "loss": 0.0021, "reward": 1.7535077929496765, "reward_std": 0.08582048118114471, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7535077333450317, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 274.9609375, "epoch": 0.29296875, "grad_norm": 12.486389397389315, "kl": 0.0555419921875, "learning_rate": 9.267578125e-07, "loss": 0.0022, "reward": 1.7247052192687988, "reward_std": 0.06530194543302059, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.724705159664154, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 321.46875, "epoch": 0.29345703125, "grad_norm": 2.0579889539520035, "kl": 0.04736328125, "learning_rate": 9.266357421875e-07, "loss": 0.0019, "reward": 1.7377859354019165, "reward_std": 0.08668200299143791, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7377859652042389, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 364.4296875, "epoch": 0.2939453125, "grad_norm": 1.2811733254368138, "kl": 0.035400390625, "learning_rate": 9.265136718749999e-07, "loss": 0.0014, "reward": 1.6522246599197388, "reward_std": 0.10386989638209343, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.660037100315094, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 327.015625, "epoch": 0.29443359375, "grad_norm": 2.525240888001395, "kl": 0.0458984375, "learning_rate": 9.263916015624999e-07, "loss": 0.0018, "reward": 1.655815601348877, "reward_std": 0.11304668337106705, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6636281311511993, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 271.46875, "epoch": 0.294921875, "grad_norm": 10.03655987535627, "kl": 0.0621337890625, "learning_rate": 9.262695312499999e-07, "loss": 0.0025, "reward": 1.736948013305664, "reward_std": 0.16118024289608002, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7681980729103088, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 334.9765625, "epoch": 0.29541015625, "grad_norm": 3.25054352753453, "kl": 0.04638671875, "learning_rate": 9.261474609375e-07, "loss": 0.0019, "reward": 1.6929279565811157, "reward_std": 0.08746526017785072, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7007405161857605, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 292.3359375, "epoch": 0.2958984375, "grad_norm": 1.9216710744456194, "kl": 0.0396728515625, "learning_rate": 9.26025390625e-07, "loss": 0.0016, "reward": 1.7309820652008057, "reward_std": 0.08170492202043533, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7309820353984833, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 299.8359375, "epoch": 0.29638671875, "grad_norm": 1.833893246452737, "kl": 0.0567626953125, "learning_rate": 9.259033203125e-07, "loss": 0.0023, "reward": 1.6237656474113464, "reward_std": 0.07675194926559925, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6237656772136688, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 284.375, "epoch": 0.296875, "grad_norm": 5.031728369153867, "kl": 0.052734375, "learning_rate": 9.2578125e-07, "loss": 0.0021, "reward": 1.7372384667396545, "reward_std": 0.07356595061719418, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7450509369373322, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 306.8359375, "epoch": 0.29736328125, "grad_norm": 2.248212440247843, "kl": 0.05078125, "learning_rate": 9.256591796874999e-07, "loss": 0.002, "reward": 1.7162050604820251, "reward_std": 0.0456718523055315, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7162051498889923, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 238.515625, "epoch": 0.2978515625, "grad_norm": 1.7714015306015924, "kl": 0.0457763671875, "learning_rate": 9.255371093749999e-07, "loss": 0.0018, "reward": 1.6449219584465027, "reward_std": 0.04260050132870674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6449219286441803, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 275.484375, "epoch": 0.29833984375, "grad_norm": 2.296362418962897, "kl": 0.0465087890625, "learning_rate": 9.254150390625e-07, "loss": 0.0019, "reward": 1.755751132965088, "reward_std": 0.11303677409887314, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7557511329650879, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 311.6171875, "epoch": 0.298828125, "grad_norm": 2.6229221995817738, "kl": 0.047607421875, "learning_rate": 9.2529296875e-07, "loss": 0.0019, "reward": 1.6748383045196533, "reward_std": 0.08769623376429081, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6826508045196533, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 305.8203125, "epoch": 0.29931640625, "grad_norm": 2.472905090535034, "kl": 0.0384521484375, "learning_rate": 9.251708984375e-07, "loss": 0.0015, "reward": 1.675347626209259, "reward_std": 0.06301023997366428, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6753476560115814, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 247.7578125, "epoch": 0.2998046875, "grad_norm": 1.968304299505306, "kl": 0.060791015625, "learning_rate": 9.25048828125e-07, "loss": 0.0024, "reward": 1.7971341013908386, "reward_std": 0.04744567163288593, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7971341013908386, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 292.796875, "epoch": 0.30029296875, "grad_norm": 4.384171526185067, "kl": 0.051025390625, "learning_rate": 9.249267578124999e-07, "loss": 0.002, "reward": 1.5328530669212341, "reward_std": 0.06077993102371693, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5328530073165894, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 349.578125, "epoch": 0.30078125, "grad_norm": 9.232583830494594, "kl": 0.0443115234375, "learning_rate": 9.248046874999999e-07, "loss": 0.0018, "reward": 1.686236560344696, "reward_std": 0.046148573979735374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6862365305423737, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 271.75, "epoch": 0.30126953125, "grad_norm": 1.5236429587824718, "kl": 0.0550537109375, "learning_rate": 9.246826171874999e-07, "loss": 0.0022, "reward": 1.7975013256072998, "reward_std": 0.045856970362365246, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7975013852119446, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 295.796875, "epoch": 0.3017578125, "grad_norm": 4.24143148058318, "kl": 0.0596923828125, "learning_rate": 9.24560546875e-07, "loss": 0.0024, "reward": 1.7057366967201233, "reward_std": 0.09794734045863152, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7057366371154785, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 251.40625, "epoch": 0.30224609375, "grad_norm": 9.882057657700463, "kl": 0.055908203125, "learning_rate": 9.244384765625e-07, "loss": 0.0022, "reward": 1.6871796250343323, "reward_std": 0.0694831982254982, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6871796250343323, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 291.8359375, "epoch": 0.302734375, "grad_norm": 1.9889738924594182, "kl": 0.0484619140625, "learning_rate": 9.2431640625e-07, "loss": 0.0019, "reward": 1.784572958946228, "reward_std": 0.05175241082906723, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7845728695392609, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 333.1484375, "epoch": 0.30322265625, "grad_norm": 2.316432033456783, "kl": 0.0416259765625, "learning_rate": 9.241943359375e-07, "loss": 0.0017, "reward": 1.8488008379936218, "reward_std": 0.06617510505020618, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8488008677959442, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 310.6953125, "epoch": 0.3037109375, "grad_norm": 2.9435316736306847, "kl": 0.05517578125, "learning_rate": 9.240722656249999e-07, "loss": 0.0022, "reward": 1.6083208918571472, "reward_std": 0.15882696211338043, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6239458322525024, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 209.2734375, "epoch": 0.30419921875, "grad_norm": 2.620065207948406, "kl": 0.05712890625, "learning_rate": 9.239501953124999e-07, "loss": 0.0023, "reward": 1.5121939182281494, "reward_std": 0.08841052651405334, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5121939033269882, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 396.5, "epoch": 0.3046875, "grad_norm": 1.706973036482709, "kl": 0.0498046875, "learning_rate": 9.23828125e-07, "loss": 0.002, "reward": 1.7409059405326843, "reward_std": 0.14900105446577072, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7487184107303619, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 208.3359375, "epoch": 0.30517578125, "grad_norm": 1.697511564265202, "kl": 0.0692138671875, "learning_rate": 9.237060546875e-07, "loss": 0.0028, "reward": 1.623015284538269, "reward_std": 0.08251120336353779, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6230152547359467, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 280.28125, "epoch": 0.3056640625, "grad_norm": 1.6225110716919982, "kl": 0.0494384765625, "learning_rate": 9.23583984375e-07, "loss": 0.002, "reward": 1.8155426383018494, "reward_std": 0.040754200890660286, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8155426383018494, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 219.4296875, "epoch": 0.30615234375, "grad_norm": 5.245194521239568, "kl": 0.0635986328125, "learning_rate": 9.234619140625e-07, "loss": 0.0025, "reward": 1.7196524143218994, "reward_std": 0.16773709654808044, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7274648249149323, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 366.7109375, "epoch": 0.306640625, "grad_norm": 2.495211042283095, "kl": 0.041748046875, "learning_rate": 9.233398437499999e-07, "loss": 0.0017, "reward": 1.7418628334999084, "reward_std": 0.06394334509968758, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7418628334999084, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 318.515625, "epoch": 0.30712890625, "grad_norm": 1.074724101016986, "kl": 0.0457763671875, "learning_rate": 9.232177734374999e-07, "loss": 0.0018, "reward": 1.6888737678527832, "reward_std": 0.15227380208671093, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6966862678527832, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 252.9765625, "epoch": 0.3076171875, "grad_norm": 2.138600967420769, "kl": 0.05224609375, "learning_rate": 9.230957031249999e-07, "loss": 0.0021, "reward": 1.528347134590149, "reward_std": 0.06925049610435963, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5283471196889877, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 299.2109375, "epoch": 0.30810546875, "grad_norm": 2.2724969029807904, "kl": 0.050048828125, "learning_rate": 9.229736328125e-07, "loss": 0.002, "reward": 1.6192744374275208, "reward_std": 0.10097651556134224, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6270869076251984, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 354.6796875, "epoch": 0.30859375, "grad_norm": 4.734654446629603, "kl": 0.0465087890625, "learning_rate": 9.228515625e-07, "loss": 0.0019, "reward": 1.5857577323913574, "reward_std": 0.14710739254951477, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5935702323913574, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 280.828125, "epoch": 0.30908203125, "grad_norm": 1.3342057084147274, "kl": 0.0533447265625, "learning_rate": 9.227294921875e-07, "loss": 0.0021, "reward": 1.7136409878730774, "reward_std": 0.10766054317355156, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7292659878730774, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 340.0703125, "epoch": 0.3095703125, "grad_norm": 1.5671664183297946, "kl": 0.0543212890625, "learning_rate": 9.22607421875e-07, "loss": 0.0022, "reward": 1.7468852996826172, "reward_std": 0.1310337483882904, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.76251021027565, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 294.53125, "epoch": 0.31005859375, "grad_norm": 1.1794567116723214, "kl": 0.0552978515625, "learning_rate": 9.224853515624999e-07, "loss": 0.0022, "reward": 1.6757075786590576, "reward_std": 0.07465480640530586, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6757076382637024, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 350.1328125, "epoch": 0.310546875, "grad_norm": 1.1485463519752817, "kl": 0.0455322265625, "learning_rate": 9.223632812499999e-07, "loss": 0.0018, "reward": 1.8436731696128845, "reward_std": 0.05472866632044315, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8436731696128845, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 242.2265625, "epoch": 0.31103515625, "grad_norm": 1.6053099530571169, "kl": 0.04443359375, "learning_rate": 9.222412109375e-07, "loss": 0.0018, "reward": 1.828328251838684, "reward_std": 0.07346354052424431, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8283282518386841, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 334.484375, "epoch": 0.3115234375, "grad_norm": 2.3253920128947945, "kl": 0.053466796875, "learning_rate": 9.22119140625e-07, "loss": 0.0021, "reward": 1.528764247894287, "reward_std": 0.16795818135142326, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5678267776966095, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 256.125, "epoch": 0.31201171875, "grad_norm": 6.0280239889864164, "kl": 0.058837890625, "learning_rate": 9.219970703125e-07, "loss": 0.0024, "reward": 1.783986210823059, "reward_std": 0.06189366802573204, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7917985916137695, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 326.7109375, "epoch": 0.3125, "grad_norm": 2.344773690623761, "kl": 0.04766845703125, "learning_rate": 9.21875e-07, "loss": 0.0019, "reward": 1.6732546091079712, "reward_std": 0.20847465842962265, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7279421091079712, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 233.515625, "epoch": 0.31298828125, "grad_norm": 7.004317625286649, "kl": 0.041259765625, "learning_rate": 9.217529296874999e-07, "loss": 0.0016, "reward": 1.760383129119873, "reward_std": 0.04730805940926075, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7603830993175507, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 302.4140625, "epoch": 0.3134765625, "grad_norm": 3.3601207436539684, "kl": 0.0477294921875, "learning_rate": 9.216308593749999e-07, "loss": 0.0019, "reward": 1.750933825969696, "reward_std": 0.04815097339451313, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7509337961673737, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 244.2265625, "epoch": 0.31396484375, "grad_norm": 1.5510742673980848, "kl": 0.059814453125, "learning_rate": 9.215087890624999e-07, "loss": 0.0024, "reward": 1.720008671283722, "reward_std": 0.04548669047653675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7200086712837219, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 344.296875, "epoch": 0.314453125, "grad_norm": 1.4934380093652202, "kl": 0.055419921875, "learning_rate": 9.2138671875e-07, "loss": 0.0022, "reward": 1.7194246053695679, "reward_std": 0.11597402952611446, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7350495755672455, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 271.40625, "epoch": 0.31494140625, "grad_norm": 2.2139528602712155, "kl": 0.0509033203125, "learning_rate": 9.212646484375e-07, "loss": 0.002, "reward": 1.7015312910079956, "reward_std": 0.03155198786407709, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7015312910079956, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 261.7578125, "epoch": 0.3154296875, "grad_norm": 1.1749981752079521, "kl": 0.0499267578125, "learning_rate": 9.21142578125e-07, "loss": 0.002, "reward": 1.6304461359977722, "reward_std": 0.12662875652313232, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6538836359977722, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 458.6484375, "epoch": 0.31591796875, "grad_norm": 1.9319087488376623, "kl": 0.0384521484375, "learning_rate": 9.210205078125e-07, "loss": 0.0015, "reward": 1.594020962715149, "reward_std": 0.20690031349658966, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6487084329128265, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 269.0625, "epoch": 0.31640625, "grad_norm": 4.887517766882272, "kl": 0.0421142578125, "learning_rate": 9.208984374999999e-07, "loss": 0.0017, "reward": 1.6909980773925781, "reward_std": 0.045924630016088486, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6909981518983841, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 333.6640625, "epoch": 0.31689453125, "grad_norm": 5.793761071887375, "kl": 0.0443115234375, "learning_rate": 9.207763671874999e-07, "loss": 0.0018, "reward": 1.7228538393974304, "reward_std": 0.11347953602671623, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7228538393974304, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 296.6640625, "epoch": 0.3173828125, "grad_norm": 1.974099723600187, "kl": 0.0489501953125, "learning_rate": 9.206542968749999e-07, "loss": 0.002, "reward": 1.7972348928451538, "reward_std": 0.08608914166688919, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7972348928451538, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 259.0546875, "epoch": 0.31787109375, "grad_norm": 1.0500416763251224, "kl": 0.05810546875, "learning_rate": 9.205322265625e-07, "loss": 0.0023, "reward": 1.6828134655952454, "reward_std": 0.04013410210609436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.682813435792923, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 298.4375, "epoch": 0.318359375, "grad_norm": 2.2876063816784025, "kl": 0.0438232421875, "learning_rate": 9.2041015625e-07, "loss": 0.0018, "reward": 1.6353506445884705, "reward_std": 0.06418109219521284, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6353506445884705, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 327.3984375, "epoch": 0.31884765625, "grad_norm": 4.503532269564074, "kl": 0.04638671875, "learning_rate": 9.202880859375e-07, "loss": 0.0019, "reward": 1.6205086708068848, "reward_std": 0.06150331161916256, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6205087304115295, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 309.3125, "epoch": 0.3193359375, "grad_norm": 2.2645762992688674, "kl": 0.0555419921875, "learning_rate": 9.201660156249999e-07, "loss": 0.0022, "reward": 1.7717258930206299, "reward_std": 0.11558713018894196, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7873509228229523, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 240.8046875, "epoch": 0.31982421875, "grad_norm": 2.5450846485367165, "kl": 0.067626953125, "learning_rate": 9.200439453124999e-07, "loss": 0.0027, "reward": 1.62615168094635, "reward_std": 0.08513330668210983, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6261517405509949, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 284.3046875, "epoch": 0.3203125, "grad_norm": 1.3612683167143704, "kl": 0.044921875, "learning_rate": 9.199218749999999e-07, "loss": 0.0018, "reward": 1.7125912308692932, "reward_std": 0.028453302569687366, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.712591290473938, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 298.6953125, "epoch": 0.32080078125, "grad_norm": 1.485856746527386, "kl": 0.0552978515625, "learning_rate": 9.197998046875e-07, "loss": 0.0022, "reward": 1.7644490003585815, "reward_std": 0.05199288483709097, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7722615003585815, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 303.5703125, "epoch": 0.3212890625, "grad_norm": 2.4798625266117824, "kl": 0.0465087890625, "learning_rate": 9.19677734375e-07, "loss": 0.0019, "reward": 1.6799516081809998, "reward_std": 0.09173119999468327, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6799516975879669, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 298.4609375, "epoch": 0.32177734375, "grad_norm": 1.7859241168898383, "kl": 0.0496826171875, "learning_rate": 9.195556640625e-07, "loss": 0.002, "reward": 1.6372400522232056, "reward_std": 0.06941110268235207, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6450526714324951, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 253.21875, "epoch": 0.322265625, "grad_norm": 9.488602048011655, "kl": 0.05810546875, "learning_rate": 9.1943359375e-07, "loss": 0.0023, "reward": 1.744973599910736, "reward_std": 0.07817739248275757, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7449735701084137, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 306.84375, "epoch": 0.32275390625, "grad_norm": 1.4547620498319576, "kl": 0.047119140625, "learning_rate": 9.193115234374999e-07, "loss": 0.0019, "reward": 1.713306725025177, "reward_std": 0.050102658569812775, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7133066952228546, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 242.6171875, "epoch": 0.3232421875, "grad_norm": 1.5620710613142204, "kl": 0.053466796875, "learning_rate": 9.191894531249999e-07, "loss": 0.0021, "reward": 1.6234807968139648, "reward_std": 0.10660821199417114, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6391057670116425, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 284.1484375, "epoch": 0.32373046875, "grad_norm": 2.8851167506317354, "kl": 0.0604248046875, "learning_rate": 9.190673828124999e-07, "loss": 0.0024, "reward": 1.7698102593421936, "reward_std": 0.09677816927433014, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7698102295398712, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 325.7421875, "epoch": 0.32421875, "grad_norm": 0.7451981174641832, "kl": 0.0447998046875, "learning_rate": 9.189453125e-07, "loss": 0.0018, "reward": 1.6291025876998901, "reward_std": 0.08335762098431587, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6291025280952454, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 284.9453125, "epoch": 0.32470703125, "grad_norm": 12.413290735105186, "kl": 0.053955078125, "learning_rate": 9.188232421875e-07, "loss": 0.0022, "reward": 1.7409712076187134, "reward_std": 0.06279715150594711, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7409711480140686, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 220.6875, "epoch": 0.3251953125, "grad_norm": 2.01957679366167, "kl": 0.0726318359375, "learning_rate": 9.18701171875e-07, "loss": 0.0029, "reward": 1.8215317130088806, "reward_std": 0.035058433189988136, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8215316832065582, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 221.796875, "epoch": 0.32568359375, "grad_norm": 1.526764750037048, "kl": 0.0648193359375, "learning_rate": 9.185791015625e-07, "loss": 0.0026, "reward": 1.5901724696159363, "reward_std": 0.1056349128484726, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5979849547147751, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 323.4609375, "epoch": 0.326171875, "grad_norm": 2.608301473030279, "kl": 0.055419921875, "learning_rate": 9.184570312499999e-07, "loss": 0.0022, "reward": 1.6940342783927917, "reward_std": 0.14149951934814453, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7174717485904694, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 308.4921875, "epoch": 0.32666015625, "grad_norm": 2.87920366371091, "kl": 0.0506591796875, "learning_rate": 9.183349609374999e-07, "loss": 0.002, "reward": 1.698991298675537, "reward_std": 0.14888149499893188, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7302412986755371, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 292.3828125, "epoch": 0.3271484375, "grad_norm": 2.6608957988350466, "kl": 0.05029296875, "learning_rate": 9.18212890625e-07, "loss": 0.002, "reward": 1.738844633102417, "reward_std": 0.10035024397075176, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7466571033000946, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 307.953125, "epoch": 0.32763671875, "grad_norm": 1.0057918255069163, "kl": 0.0528564453125, "learning_rate": 9.180908203125e-07, "loss": 0.0021, "reward": 1.6742581129074097, "reward_std": 0.09272240474820137, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6898830831050873, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 314.078125, "epoch": 0.328125, "grad_norm": 2.8489826502257394, "kl": 0.0623779296875, "learning_rate": 9.1796875e-07, "loss": 0.0025, "reward": 1.7578362226486206, "reward_std": 0.1680883914232254, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7890861630439758, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 376.671875, "epoch": 0.32861328125, "grad_norm": 9.232271074394879, "kl": 0.0506591796875, "learning_rate": 9.178466796875e-07, "loss": 0.002, "reward": 1.664437174797058, "reward_std": 0.11197399348020554, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6956871449947357, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 314.578125, "epoch": 0.3291015625, "grad_norm": 1.317088116181306, "kl": 0.0521240234375, "learning_rate": 9.177246093749999e-07, "loss": 0.0021, "reward": 1.7777928113937378, "reward_std": 0.06812568381428719, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7777928411960602, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 288.6796875, "epoch": 0.32958984375, "grad_norm": 8.928077235738657, "kl": 0.0562744140625, "learning_rate": 9.176025390624999e-07, "loss": 0.0023, "reward": 1.752385139465332, "reward_std": 0.08091514930129051, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7523851096630096, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 307.4375, "epoch": 0.330078125, "grad_norm": 3.3096762270610833, "kl": 0.055419921875, "learning_rate": 9.174804687499999e-07, "loss": 0.0022, "reward": 1.693404495716095, "reward_std": 0.09680695086717606, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6934045851230621, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 287.6796875, "epoch": 0.33056640625, "grad_norm": 3.731533977246003, "kl": 0.09912109375, "learning_rate": 9.173583984375e-07, "loss": 0.004, "reward": 1.6468342542648315, "reward_std": 0.06382020935416222, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6468342244625092, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 339.1328125, "epoch": 0.3310546875, "grad_norm": 3.732612140042579, "kl": 0.05029296875, "learning_rate": 9.17236328125e-07, "loss": 0.002, "reward": 1.7785282731056213, "reward_std": 0.14171504974365234, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7785282731056213, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 281.671875, "epoch": 0.33154296875, "grad_norm": 18.476333671843648, "kl": 0.0657958984375, "learning_rate": 9.171142578125e-07, "loss": 0.0026, "reward": 1.6994601488113403, "reward_std": 0.054649246856570244, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6994601488113403, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 243.6953125, "epoch": 0.33203125, "grad_norm": 3.4222567750636594, "kl": 0.0540771484375, "learning_rate": 9.169921875e-07, "loss": 0.0022, "reward": 1.7493921518325806, "reward_std": 0.036320459097623825, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7493922114372253, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 270.046875, "epoch": 0.33251953125, "grad_norm": 7.10011144738821, "kl": 0.0543212890625, "learning_rate": 9.168701171874999e-07, "loss": 0.0022, "reward": 1.6045172810554504, "reward_std": 0.06905798241496086, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.604517251253128, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 344.96875, "epoch": 0.3330078125, "grad_norm": 2.739310274689876, "kl": 0.053955078125, "learning_rate": 9.167480468749999e-07, "loss": 0.0022, "reward": 1.7031362056732178, "reward_std": 0.0952284187078476, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7109486758708954, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 314.96875, "epoch": 0.33349609375, "grad_norm": 1.8552315668461727, "kl": 0.0484619140625, "learning_rate": 9.166259765625e-07, "loss": 0.0019, "reward": 1.6751747131347656, "reward_std": 0.03803575597703457, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6751746535301208, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 260.1953125, "epoch": 0.333984375, "grad_norm": 3.66299552441634, "kl": 0.064208984375, "learning_rate": 9.1650390625e-07, "loss": 0.0026, "reward": 1.6129703521728516, "reward_std": 0.06539808213710785, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6129703521728516, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 388.4453125, "epoch": 0.33447265625, "grad_norm": 1.7148741325377912, "kl": 0.052978515625, "learning_rate": 9.163818359375e-07, "loss": 0.0021, "reward": 1.6778011322021484, "reward_std": 0.16429652273654938, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7168635725975037, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 295.640625, "epoch": 0.3349609375, "grad_norm": 2.4636169099366687, "kl": 0.052001953125, "learning_rate": 9.16259765625e-07, "loss": 0.0021, "reward": 1.7410337924957275, "reward_std": 0.10263085551559925, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7566587924957275, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 287.3125, "epoch": 0.33544921875, "grad_norm": 2.3867093626757043, "kl": 0.0496826171875, "learning_rate": 9.161376953124999e-07, "loss": 0.002, "reward": 1.7729321718215942, "reward_std": 0.04040984623134136, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7729321420192719, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 305.40625, "epoch": 0.3359375, "grad_norm": 2.7157119941800283, "kl": 0.059326171875, "learning_rate": 9.160156249999999e-07, "loss": 0.0024, "reward": 1.644744098186493, "reward_std": 0.08117583952844143, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6447441577911377, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 285.765625, "epoch": 0.33642578125, "grad_norm": 3.1337083108597916, "kl": 0.057861328125, "learning_rate": 9.158935546874999e-07, "loss": 0.0023, "reward": 1.7134816646575928, "reward_std": 0.10441340506076813, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7134817242622375, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 237.7734375, "epoch": 0.3369140625, "grad_norm": 3.76318572842791, "kl": 0.05908203125, "learning_rate": 9.15771484375e-07, "loss": 0.0024, "reward": 1.673618733882904, "reward_std": 0.055534401908516884, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6736188232898712, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 327.1875, "epoch": 0.33740234375, "grad_norm": 3.0202608593298432, "kl": 0.0506591796875, "learning_rate": 9.156494140625e-07, "loss": 0.002, "reward": 1.7747870087623596, "reward_std": 0.10330940037965775, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7747870087623596, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 286.5078125, "epoch": 0.337890625, "grad_norm": 2.0419796805050003, "kl": 0.05224609375, "learning_rate": 9.1552734375e-07, "loss": 0.0021, "reward": 1.6370163559913635, "reward_std": 0.09477332793176174, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6526413559913635, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 468.140625, "epoch": 0.33837890625, "grad_norm": 1.6907575692886263, "kl": 0.0462646484375, "learning_rate": 9.154052734375e-07, "loss": 0.0019, "reward": 1.7515615820884705, "reward_std": 0.1177232563495636, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7671865224838257, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 319.28125, "epoch": 0.3388671875, "grad_norm": 2.8360519678229092, "kl": 0.0552978515625, "learning_rate": 9.152832031249999e-07, "loss": 0.0022, "reward": 1.6726796627044678, "reward_std": 0.11348319053649902, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6883046329021454, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 267.8515625, "epoch": 0.33935546875, "grad_norm": 2.3989655228404994, "kl": 0.0654296875, "learning_rate": 9.151611328124999e-07, "loss": 0.0026, "reward": 1.5922715663909912, "reward_std": 0.09356234222650528, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6391465961933136, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 287.9296875, "epoch": 0.33984375, "grad_norm": 2.653042005534019, "kl": 0.053955078125, "learning_rate": 9.150390625e-07, "loss": 0.0022, "reward": 1.656396508216858, "reward_std": 0.09683545306324959, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6642089486122131, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 219.328125, "epoch": 0.34033203125, "grad_norm": 1.7573427123722254, "kl": 0.06591796875, "learning_rate": 9.149169921875e-07, "loss": 0.0026, "reward": 1.6544893980026245, "reward_std": 0.0390294985845685, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6544894278049469, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 361.8984375, "epoch": 0.3408203125, "grad_norm": 1.3553270738396268, "kl": 0.05078125, "learning_rate": 9.14794921875e-07, "loss": 0.002, "reward": 1.7640778422355652, "reward_std": 0.07602404989302158, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7718902826309204, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 329.078125, "epoch": 0.34130859375, "grad_norm": 3.683035859736295, "kl": 0.0499267578125, "learning_rate": 9.146728515625e-07, "loss": 0.002, "reward": 1.7418290376663208, "reward_std": 0.1612987220287323, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7652665674686432, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 392.75, "epoch": 0.341796875, "grad_norm": 3.298949549269414, "kl": 0.0498046875, "learning_rate": 9.145507812499999e-07, "loss": 0.002, "reward": 1.6274558901786804, "reward_std": 0.15485302917659283, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6665183305740356, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 330.75, "epoch": 0.34228515625, "grad_norm": 7.338680707895053, "kl": 0.0615234375, "learning_rate": 9.144287109374999e-07, "loss": 0.0025, "reward": 1.7153563499450684, "reward_std": 0.13466084748506546, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7387937903404236, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 348.375, "epoch": 0.3427734375, "grad_norm": 1.4740042030073908, "kl": 0.0465087890625, "learning_rate": 9.143066406249999e-07, "loss": 0.0019, "reward": 1.674963891506195, "reward_std": 0.04929056763648987, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6749638915061951, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 341.7578125, "epoch": 0.34326171875, "grad_norm": 2.2370564002640707, "kl": 0.0498046875, "learning_rate": 9.141845703125e-07, "loss": 0.002, "reward": 1.6746094226837158, "reward_std": 0.1111318301409483, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.705859363079071, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 239.7578125, "epoch": 0.34375, "grad_norm": 3.652860145128934, "kl": 0.0562744140625, "learning_rate": 9.140625e-07, "loss": 0.0022, "reward": 1.8176313638687134, "reward_std": 0.10441552102565765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.817631334066391, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 328.59375, "epoch": 0.34423828125, "grad_norm": 1.9373839999803215, "kl": 0.052734375, "learning_rate": 9.139404296875e-07, "loss": 0.0021, "reward": 1.6220948100090027, "reward_std": 0.12009907513856888, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6924073398113251, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 280.734375, "epoch": 0.3447265625, "grad_norm": 10.496349482097362, "kl": 0.05029296875, "learning_rate": 9.13818359375e-07, "loss": 0.002, "reward": 1.7156425714492798, "reward_std": 0.10354878753423691, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7468925714492798, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 340.6796875, "epoch": 0.34521484375, "grad_norm": 2.189163149145361, "kl": 0.043701171875, "learning_rate": 9.136962890624999e-07, "loss": 0.0017, "reward": 1.7362082600593567, "reward_std": 0.1519409567117691, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7596457600593567, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 221.6640625, "epoch": 0.345703125, "grad_norm": 3.342079279569258, "kl": 0.0635986328125, "learning_rate": 9.135742187499999e-07, "loss": 0.0025, "reward": 1.8435426950454712, "reward_std": 0.03380415961146355, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8435426652431488, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 316.875, "epoch": 0.34619140625, "grad_norm": 2.1844352528050766, "kl": 0.048583984375, "learning_rate": 9.134521484375e-07, "loss": 0.0019, "reward": 1.6824636459350586, "reward_std": 0.10241992585361004, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7059011459350586, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 270.828125, "epoch": 0.3466796875, "grad_norm": 23.655131129237514, "kl": 0.0670166015625, "learning_rate": 9.13330078125e-07, "loss": 0.0027, "reward": 1.7630151510238647, "reward_std": 0.10404928401112556, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7630151808261871, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 425.203125, "epoch": 0.34716796875, "grad_norm": 3.1467811559428855, "kl": 0.039794921875, "learning_rate": 9.132080078125e-07, "loss": 0.0016, "reward": 1.6210336685180664, "reward_std": 0.10416779294610023, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6210336685180664, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 257.421875, "epoch": 0.34765625, "grad_norm": 2.5883626552063275, "kl": 0.0665283203125, "learning_rate": 9.130859375e-07, "loss": 0.0027, "reward": 1.679788887500763, "reward_std": 0.08418247289955616, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6797888875007629, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 311.5546875, "epoch": 0.34814453125, "grad_norm": 4.799696463307074, "kl": 0.0499267578125, "learning_rate": 9.129638671874999e-07, "loss": 0.002, "reward": 1.6627951860427856, "reward_std": 0.21561793982982635, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7174826860427856, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 307.8125, "epoch": 0.3486328125, "grad_norm": 5.690753458769312, "kl": 0.055908203125, "learning_rate": 9.128417968749999e-07, "loss": 0.0022, "reward": 1.596695363521576, "reward_std": 0.08007179386913776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5966953039169312, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 334.34375, "epoch": 0.34912109375, "grad_norm": 1.9981939045151302, "kl": 0.0426025390625, "learning_rate": 9.127197265624999e-07, "loss": 0.0017, "reward": 1.65777987241745, "reward_std": 0.07475204393267632, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6577799022197723, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 395.1171875, "epoch": 0.349609375, "grad_norm": 1.0717553143893126, "kl": 0.039306640625, "learning_rate": 9.1259765625e-07, "loss": 0.0016, "reward": 1.5546205639839172, "reward_std": 0.13567753694951534, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6014955639839172, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 415.1015625, "epoch": 0.35009765625, "grad_norm": 3.8384389147264586, "kl": 0.046142578125, "learning_rate": 9.124755859375e-07, "loss": 0.0018, "reward": 1.665941596031189, "reward_std": 0.16234686970710754, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7128165364265442, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 250.4296875, "epoch": 0.3505859375, "grad_norm": 5.987053658956865, "kl": 0.0538330078125, "learning_rate": 9.12353515625e-07, "loss": 0.0022, "reward": 1.6448271870613098, "reward_std": 0.04302874393761158, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6448271870613098, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 318.46875, "epoch": 0.35107421875, "grad_norm": 1.5749796860637693, "kl": 0.0450439453125, "learning_rate": 9.122314453125e-07, "loss": 0.0018, "reward": 1.682013988494873, "reward_std": 0.16238265484571457, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6976390182971954, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 284.234375, "epoch": 0.3515625, "grad_norm": 1.321614461326094, "kl": 0.052978515625, "learning_rate": 9.121093749999999e-07, "loss": 0.0021, "reward": 1.6903913617134094, "reward_std": 0.06298444792628288, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6903913915157318, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 310.5390625, "epoch": 0.35205078125, "grad_norm": 1.521983159109129, "kl": 0.05126953125, "learning_rate": 9.119873046874999e-07, "loss": 0.002, "reward": 1.662535011768341, "reward_std": 0.09836256504058838, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6703475117683411, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 421.8046875, "epoch": 0.3525390625, "grad_norm": 5.592012602896816, "kl": 0.0384521484375, "learning_rate": 9.11865234375e-07, "loss": 0.0015, "reward": 1.6349376440048218, "reward_std": 0.2715572118759155, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6974376440048218, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 275.828125, "epoch": 0.35302734375, "grad_norm": 7.607326995892132, "kl": 0.0521240234375, "learning_rate": 9.117431640625e-07, "loss": 0.0021, "reward": 1.7890739440917969, "reward_std": 0.09455129504203796, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7890739738941193, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 274.28125, "epoch": 0.353515625, "grad_norm": 2.0564996524853925, "kl": 0.048095703125, "learning_rate": 9.1162109375e-07, "loss": 0.0019, "reward": 1.75455242395401, "reward_std": 0.05431245639920235, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.75455242395401, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 242.390625, "epoch": 0.35400390625, "grad_norm": 1.891536592962387, "kl": 0.0587158203125, "learning_rate": 9.114990234375e-07, "loss": 0.0023, "reward": 1.6842128038406372, "reward_std": 0.0393197163939476, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6842128038406372, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 258.7109375, "epoch": 0.3544921875, "grad_norm": 2.6735308908162017, "kl": 0.050537109375, "learning_rate": 9.113769531249999e-07, "loss": 0.002, "reward": 1.677639126777649, "reward_std": 0.04795477353036404, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6776390969753265, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 373.1953125, "epoch": 0.35498046875, "grad_norm": 4.067919313994053, "kl": 0.0478515625, "learning_rate": 9.112548828124999e-07, "loss": 0.0019, "reward": 1.759274661540985, "reward_std": 0.16164502501487732, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7670871615409851, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 238.03125, "epoch": 0.35546875, "grad_norm": 1.4554737342898283, "kl": 0.05859375, "learning_rate": 9.111328124999999e-07, "loss": 0.0023, "reward": 1.6114553213119507, "reward_std": 0.07473801448941231, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6192677319049835, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 284.0078125, "epoch": 0.35595703125, "grad_norm": 5.937471894743553, "kl": 0.0450439453125, "learning_rate": 9.110107421875e-07, "loss": 0.0018, "reward": 1.8060181140899658, "reward_std": 0.06944678723812103, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8060181140899658, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 283.1640625, "epoch": 0.3564453125, "grad_norm": 2.892417832234497, "kl": 0.046875, "learning_rate": 9.10888671875e-07, "loss": 0.0019, "reward": 1.699116826057434, "reward_std": 0.030583031941205263, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6991168558597565, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 354.78125, "epoch": 0.35693359375, "grad_norm": 4.291871230697797, "kl": 0.05224609375, "learning_rate": 9.107666015625e-07, "loss": 0.0021, "reward": 1.6441110372543335, "reward_std": 0.06929890811443329, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6441109776496887, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 261.1640625, "epoch": 0.357421875, "grad_norm": 5.6899357611370975, "kl": 0.0576171875, "learning_rate": 9.1064453125e-07, "loss": 0.0023, "reward": 1.7429944276809692, "reward_std": 0.05181153491139412, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7429944574832916, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 302.9765625, "epoch": 0.35791015625, "grad_norm": 2.0271310342750906, "kl": 0.045654296875, "learning_rate": 9.105224609374999e-07, "loss": 0.0018, "reward": 1.7651514410972595, "reward_std": 0.05858886428177357, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7651514708995819, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 302.46875, "epoch": 0.3583984375, "grad_norm": 4.625382081677046, "kl": 0.056640625, "learning_rate": 9.104003906249999e-07, "loss": 0.0023, "reward": 1.7296615242958069, "reward_std": 0.18934501707553864, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7374739944934845, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 289.25, "epoch": 0.35888671875, "grad_norm": 2.106702124053642, "kl": 0.0462646484375, "learning_rate": 9.102783203125e-07, "loss": 0.0019, "reward": 1.6757431626319885, "reward_std": 0.03387642838060856, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6757431626319885, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 250.421875, "epoch": 0.359375, "grad_norm": 2.2254393181907464, "kl": 0.056884765625, "learning_rate": 9.1015625e-07, "loss": 0.0023, "reward": 1.672728419303894, "reward_std": 0.057467855513095856, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6727283895015717, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 281.921875, "epoch": 0.35986328125, "grad_norm": 2.8705551314702804, "kl": 0.047607421875, "learning_rate": 9.100341796875e-07, "loss": 0.0019, "reward": 1.7556483745574951, "reward_std": 0.12223165854811668, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7634609639644623, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 278.859375, "epoch": 0.3603515625, "grad_norm": 1.9718644163448455, "kl": 0.064208984375, "learning_rate": 9.09912109375e-07, "loss": 0.0026, "reward": 1.5958901643753052, "reward_std": 0.1189500167965889, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6037026047706604, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 273.140625, "epoch": 0.36083984375, "grad_norm": 1.1551556849612885, "kl": 0.053466796875, "learning_rate": 9.097900390624999e-07, "loss": 0.0021, "reward": 1.6770064234733582, "reward_std": 0.06842825934290886, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6770065128803253, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 261.5, "epoch": 0.361328125, "grad_norm": 1.611938756026289, "kl": 0.0496826171875, "learning_rate": 9.096679687499999e-07, "loss": 0.002, "reward": 1.871577262878418, "reward_std": 0.08976521715521812, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8793897330760956, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 227.3046875, "epoch": 0.36181640625, "grad_norm": 2.2338765528780593, "kl": 0.06201171875, "learning_rate": 9.095458984374999e-07, "loss": 0.0025, "reward": 1.7528924942016602, "reward_std": 0.06052309833467007, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7528924942016602, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 233.9609375, "epoch": 0.3623046875, "grad_norm": 2.2843239877131483, "kl": 0.04931640625, "learning_rate": 9.09423828125e-07, "loss": 0.002, "reward": 1.7976442575454712, "reward_std": 0.07765695080161095, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7976443469524384, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 234.171875, "epoch": 0.36279296875, "grad_norm": 2.5837045435100285, "kl": 0.054443359375, "learning_rate": 9.093017578125e-07, "loss": 0.0022, "reward": 1.7078853845596313, "reward_std": 0.08771786838769913, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7156979441642761, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 252.65625, "epoch": 0.36328125, "grad_norm": 14.063807731932368, "kl": 0.05712890625, "learning_rate": 9.091796875e-07, "loss": 0.0023, "reward": 1.705945611000061, "reward_std": 0.08700169250369072, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7059455513954163, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 278.1796875, "epoch": 0.36376953125, "grad_norm": 88.25210889413222, "kl": 0.0533447265625, "learning_rate": 9.090576171875e-07, "loss": 0.0021, "reward": 1.6465200781822205, "reward_std": 0.10988815873861313, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6465200483798981, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 322.8203125, "epoch": 0.3642578125, "grad_norm": 7.516718824656179, "kl": 0.051513671875, "learning_rate": 9.089355468749999e-07, "loss": 0.0021, "reward": 1.7468097805976868, "reward_std": 0.06035367026925087, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7468098104000092, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 266.4765625, "epoch": 0.36474609375, "grad_norm": 2.90080670052233, "kl": 0.082275390625, "learning_rate": 9.088134765624999e-07, "loss": 0.0033, "reward": 1.7886146306991577, "reward_std": 0.053633132949471474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7886146605014801, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 308.3984375, "epoch": 0.365234375, "grad_norm": 2.4070521070996658, "kl": 0.0474853515625, "learning_rate": 9.0869140625e-07, "loss": 0.0019, "reward": 1.7509766221046448, "reward_std": 0.07239764928817749, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7509766519069672, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 297.3203125, "epoch": 0.36572265625, "grad_norm": 2.3297506701189885, "kl": 0.0506591796875, "learning_rate": 9.085693359375e-07, "loss": 0.002, "reward": 1.7397636771202087, "reward_std": 0.07227146998047829, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7397636771202087, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 271.75, "epoch": 0.3662109375, "grad_norm": 2.4906923300155035, "kl": 0.049560546875, "learning_rate": 9.08447265625e-07, "loss": 0.002, "reward": 1.7281315326690674, "reward_std": 0.09530112892389297, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7281315326690674, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 311.296875, "epoch": 0.36669921875, "grad_norm": 2.095615441296317, "kl": 0.083740234375, "learning_rate": 9.083251953125e-07, "loss": 0.0033, "reward": 1.8014087677001953, "reward_std": 0.1527663916349411, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8014088273048401, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 374.421875, "epoch": 0.3671875, "grad_norm": 2.023204451439646, "kl": 0.05126953125, "learning_rate": 9.082031249999999e-07, "loss": 0.0021, "reward": 1.5856056809425354, "reward_std": 0.12917165458202362, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6090432107448578, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 308.1953125, "epoch": 0.36767578125, "grad_norm": 4.535857246107816, "kl": 0.05615234375, "learning_rate": 9.080810546874999e-07, "loss": 0.0022, "reward": 1.551247239112854, "reward_std": 0.18378467857837677, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5746847093105316, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 211.421875, "epoch": 0.3681640625, "grad_norm": 10.79203763732622, "kl": 0.1080322265625, "learning_rate": 9.079589843749999e-07, "loss": 0.0043, "reward": 1.7111627459526062, "reward_std": 0.11090904846787453, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7111627459526062, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 337.5859375, "epoch": 0.36865234375, "grad_norm": 4.366419277780804, "kl": 0.0460205078125, "learning_rate": 9.078369140625e-07, "loss": 0.0018, "reward": 1.5475510954856873, "reward_std": 0.1516926810145378, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5788010954856873, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 270.40625, "epoch": 0.369140625, "grad_norm": 2.2502977034316816, "kl": 0.0596923828125, "learning_rate": 9.0771484375e-07, "loss": 0.0024, "reward": 1.6759621500968933, "reward_std": 0.08574535697698593, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6837746500968933, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 304.6953125, "epoch": 0.36962890625, "grad_norm": 3.0394652562883557, "kl": 0.045166015625, "learning_rate": 9.075927734375e-07, "loss": 0.0018, "reward": 1.6703930497169495, "reward_std": 0.10942208580672741, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6782055497169495, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 202.7109375, "epoch": 0.3701171875, "grad_norm": 3.209213137923711, "kl": 0.06884765625, "learning_rate": 9.07470703125e-07, "loss": 0.0028, "reward": 1.6818158030509949, "reward_std": 0.06707624718546867, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6818158030509949, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 284.671875, "epoch": 0.37060546875, "grad_norm": 6.314452162326053, "kl": 0.0498046875, "learning_rate": 9.073486328124999e-07, "loss": 0.002, "reward": 1.7439817786216736, "reward_std": 0.06930938735604286, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7439817786216736, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 251.3359375, "epoch": 0.37109375, "grad_norm": 1.6588947555909435, "kl": 0.052490234375, "learning_rate": 9.072265624999999e-07, "loss": 0.0021, "reward": 1.7676947116851807, "reward_std": 0.06368311867117882, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7676947116851807, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 315.0625, "epoch": 0.37158203125, "grad_norm": 2.056420743637371, "kl": 0.0501708984375, "learning_rate": 9.071044921874999e-07, "loss": 0.002, "reward": 1.5630112886428833, "reward_std": 0.12552234530448914, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5864488482475281, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 276.34375, "epoch": 0.3720703125, "grad_norm": 2.008715753441554, "kl": 0.062255859375, "learning_rate": 9.06982421875e-07, "loss": 0.0025, "reward": 1.591386616230011, "reward_std": 0.11909160390496254, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.607011616230011, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 301.421875, "epoch": 0.37255859375, "grad_norm": 6.277949214463844, "kl": 0.05859375, "learning_rate": 9.068603515625e-07, "loss": 0.0023, "reward": 1.6171656847000122, "reward_std": 0.048231493681669235, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6171657145023346, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 318.671875, "epoch": 0.373046875, "grad_norm": 5.012112695502121, "kl": 0.0579833984375, "learning_rate": 9.0673828125e-07, "loss": 0.0023, "reward": 1.7434178590774536, "reward_std": 0.06155427545309067, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7434178590774536, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 323.7578125, "epoch": 0.37353515625, "grad_norm": 9.599500758693086, "kl": 0.0579833984375, "learning_rate": 9.066162109375e-07, "loss": 0.0023, "reward": 1.5466606616973877, "reward_std": 0.09597665816545486, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5544731467962265, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 236.09375, "epoch": 0.3740234375, "grad_norm": 7.306759003859089, "kl": 0.0556640625, "learning_rate": 9.064941406249999e-07, "loss": 0.0022, "reward": 1.763745129108429, "reward_std": 0.08324461057782173, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7793701589107513, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 257.3828125, "epoch": 0.37451171875, "grad_norm": 1.8312574638865227, "kl": 0.0537109375, "learning_rate": 9.063720703124999e-07, "loss": 0.0021, "reward": 1.7746469378471375, "reward_std": 0.04006502404808998, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7746469378471375, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 305.1953125, "epoch": 0.375, "grad_norm": 1.2098375787612152, "kl": 0.04638671875, "learning_rate": 9.0625e-07, "loss": 0.0019, "reward": 1.7166728377342224, "reward_std": 0.05403112433850765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7166728675365448, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 287.984375, "epoch": 0.37548828125, "grad_norm": 8.565656666405385, "kl": 0.050537109375, "learning_rate": 9.061279296875e-07, "loss": 0.002, "reward": 1.6393229365348816, "reward_std": 0.08363521099090576, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6393230259418488, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 295.203125, "epoch": 0.3759765625, "grad_norm": 1.638069098109109, "kl": 0.0606689453125, "learning_rate": 9.06005859375e-07, "loss": 0.0024, "reward": 1.7592090964317322, "reward_std": 0.11197786778211594, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7670215368270874, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 326.5078125, "epoch": 0.37646484375, "grad_norm": 5.669521003197282, "kl": 0.046875, "learning_rate": 9.058837890625e-07, "loss": 0.0019, "reward": 1.7937055826187134, "reward_std": 0.04042255226522684, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7937055230140686, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 342.2421875, "epoch": 0.376953125, "grad_norm": 2.4131694725255524, "kl": 0.04736328125, "learning_rate": 9.057617187499999e-07, "loss": 0.0019, "reward": 1.754515528678894, "reward_std": 0.08119422942399979, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.762328028678894, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 316.125, "epoch": 0.37744140625, "grad_norm": 1.2158435987969012, "kl": 0.0474853515625, "learning_rate": 9.056396484374999e-07, "loss": 0.0019, "reward": 1.589455008506775, "reward_std": 0.15698669105768204, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6128924638032913, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 227.1875, "epoch": 0.3779296875, "grad_norm": 3.511854371924245, "kl": 0.064208984375, "learning_rate": 9.055175781249999e-07, "loss": 0.0026, "reward": 1.747837781906128, "reward_std": 0.047062634490430355, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7478377819061279, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 304.6875, "epoch": 0.37841796875, "grad_norm": 2.035616546962937, "kl": 0.0478515625, "learning_rate": 9.053955078125e-07, "loss": 0.0019, "reward": 1.7458081245422363, "reward_std": 0.1068628765642643, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7536205947399139, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 374.9296875, "epoch": 0.37890625, "grad_norm": 9.725323832534293, "kl": 0.052490234375, "learning_rate": 9.052734375e-07, "loss": 0.0021, "reward": 1.630423367023468, "reward_std": 0.16277416795492172, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.7085483372211456, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 365.5390625, "epoch": 0.37939453125, "grad_norm": 2.746445302407796, "kl": 0.056640625, "learning_rate": 9.051513671875e-07, "loss": 0.0023, "reward": 1.6005674600601196, "reward_std": 0.1270945593714714, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6083799302577972, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 332.7265625, "epoch": 0.3798828125, "grad_norm": 2.171791972938971, "kl": 0.0400390625, "learning_rate": 9.05029296875e-07, "loss": 0.0016, "reward": 1.71004056930542, "reward_std": 0.09835747629404068, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7100405395030975, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 340.8984375, "epoch": 0.38037109375, "grad_norm": 1.4601597679507083, "kl": 0.0455322265625, "learning_rate": 9.049072265624999e-07, "loss": 0.0018, "reward": 1.6127532720565796, "reward_std": 0.08331700228154659, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6127532124519348, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 273.890625, "epoch": 0.380859375, "grad_norm": 1.245741857334542, "kl": 0.0604248046875, "learning_rate": 9.047851562499999e-07, "loss": 0.0024, "reward": 1.7605129480361938, "reward_std": 0.0731951892375946, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7605129182338715, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 377.7578125, "epoch": 0.38134765625, "grad_norm": 1.9107655030172803, "kl": 0.042724609375, "learning_rate": 9.046630859375e-07, "loss": 0.0017, "reward": 1.7617830038070679, "reward_std": 0.1986825242638588, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7930330038070679, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 302.421875, "epoch": 0.3818359375, "grad_norm": 1.6052928141201563, "kl": 0.0528564453125, "learning_rate": 9.04541015625e-07, "loss": 0.0021, "reward": 1.6557468175888062, "reward_std": 0.1146387904882431, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6713717877864838, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 370.234375, "epoch": 0.38232421875, "grad_norm": 3.5152924375718095, "kl": 0.0445556640625, "learning_rate": 9.044189453125e-07, "loss": 0.0018, "reward": 1.7099875807762146, "reward_std": 0.10192125290632248, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7412375807762146, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 266.75, "epoch": 0.3828125, "grad_norm": 1.513013153822231, "kl": 0.059814453125, "learning_rate": 9.04296875e-07, "loss": 0.0024, "reward": 1.7009785175323486, "reward_std": 0.07997079566121101, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7087909579277039, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 255.765625, "epoch": 0.38330078125, "grad_norm": 1.1283536516038533, "kl": 0.0535888671875, "learning_rate": 9.041748046874999e-07, "loss": 0.0021, "reward": 1.795514464378357, "reward_std": 0.03493136540055275, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7955144643783569, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 380.125, "epoch": 0.3837890625, "grad_norm": 1.7312441195961588, "kl": 0.0504150390625, "learning_rate": 9.040527343749999e-07, "loss": 0.002, "reward": 1.606820523738861, "reward_std": 0.12857604026794434, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6224455237388611, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 418.40625, "epoch": 0.38427734375, "grad_norm": 2.4139628304327583, "kl": 0.0506591796875, "learning_rate": 9.039306640624999e-07, "loss": 0.002, "reward": 1.6258866786956787, "reward_std": 0.22364450991153717, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6883866786956787, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 322.34375, "epoch": 0.384765625, "grad_norm": 1.0442715242086436, "kl": 0.0576171875, "learning_rate": 9.0380859375e-07, "loss": 0.0023, "reward": 1.6823328137397766, "reward_std": 0.033012090250849724, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6823328137397766, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 393.1796875, "epoch": 0.38525390625, "grad_norm": 3.6998708553756336, "kl": 0.051025390625, "learning_rate": 9.036865234375e-07, "loss": 0.002, "reward": 1.6819748878479004, "reward_std": 0.10724844038486481, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.689787358045578, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 228.703125, "epoch": 0.3857421875, "grad_norm": 2.743911349386069, "kl": 0.0635986328125, "learning_rate": 9.03564453125e-07, "loss": 0.0025, "reward": 1.6518617272377014, "reward_std": 0.08327071741223335, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.651861697435379, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 269.3359375, "epoch": 0.38623046875, "grad_norm": 2.558950840214557, "kl": 0.06005859375, "learning_rate": 9.034423828125e-07, "loss": 0.0024, "reward": 1.8231335282325745, "reward_std": 0.0741860456764698, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8231335282325745, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 351.84375, "epoch": 0.38671875, "grad_norm": 2.495274964010672, "kl": 0.0565185546875, "learning_rate": 9.033203124999999e-07, "loss": 0.0023, "reward": 1.7564613819122314, "reward_std": 0.12410943582654, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7642738223075867, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 280.9453125, "epoch": 0.38720703125, "grad_norm": 2.7224758525530466, "kl": 0.0693359375, "learning_rate": 9.031982421874999e-07, "loss": 0.0028, "reward": 1.6794022917747498, "reward_std": 0.0662822276353836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.679402232170105, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 304.828125, "epoch": 0.3876953125, "grad_norm": 2.008117882656302, "kl": 0.047119140625, "learning_rate": 9.03076171875e-07, "loss": 0.0019, "reward": 1.7753472328186035, "reward_std": 0.06647790595889091, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7753472328186035, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 341.9765625, "epoch": 0.38818359375, "grad_norm": 1.7981907802676043, "kl": 0.0562744140625, "learning_rate": 9.029541015625e-07, "loss": 0.0022, "reward": 1.6716668605804443, "reward_std": 0.10441191494464874, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6794794797897339, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 321.2109375, "epoch": 0.388671875, "grad_norm": 2.811941047744118, "kl": 0.0623779296875, "learning_rate": 9.0283203125e-07, "loss": 0.0025, "reward": 1.7318763136863708, "reward_std": 0.13041818886995316, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7396888434886932, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 324.8515625, "epoch": 0.38916015625, "grad_norm": 1.5360755449383108, "kl": 0.0634765625, "learning_rate": 9.027099609375e-07, "loss": 0.0025, "reward": 1.62107652425766, "reward_std": 0.11381476372480392, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6367015540599823, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 272.1484375, "epoch": 0.3896484375, "grad_norm": 2.2214500756443547, "kl": 0.06005859375, "learning_rate": 9.025878906249999e-07, "loss": 0.0024, "reward": 1.75680810213089, "reward_std": 0.12856251932680607, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7724330723285675, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 250.2734375, "epoch": 0.39013671875, "grad_norm": 6.470318010002288, "kl": 0.0697021484375, "learning_rate": 9.024658203124999e-07, "loss": 0.0028, "reward": 1.7349693775177002, "reward_std": 0.05125601589679718, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7349693477153778, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 365.96875, "epoch": 0.390625, "grad_norm": 4.746876315115609, "kl": 0.0447998046875, "learning_rate": 9.023437499999999e-07, "loss": 0.0018, "reward": 1.6792126893997192, "reward_std": 0.10291677340865135, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6948377192020416, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 229.828125, "epoch": 0.39111328125, "grad_norm": 5.930786243054409, "kl": 0.069091796875, "learning_rate": 9.022216796875e-07, "loss": 0.0028, "reward": 1.64633446931839, "reward_std": 0.03413202054798603, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6463344395160675, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 384.28125, "epoch": 0.3916015625, "grad_norm": 2.7295402764397503, "kl": 0.0416259765625, "learning_rate": 9.02099609375e-07, "loss": 0.0017, "reward": 1.7840456366539001, "reward_std": 0.11326225847005844, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7918581366539001, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 294.3828125, "epoch": 0.39208984375, "grad_norm": 2.061940610036514, "kl": 0.060302734375, "learning_rate": 9.019775390625e-07, "loss": 0.0024, "reward": 1.684591829776764, "reward_std": 0.039285000413656235, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6845918297767639, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 255.90625, "epoch": 0.392578125, "grad_norm": 2.6902536765952516, "kl": 0.0595703125, "learning_rate": 9.0185546875e-07, "loss": 0.0024, "reward": 1.6753877997398376, "reward_std": 0.0330571923404932, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6753877997398376, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 229.28125, "epoch": 0.39306640625, "grad_norm": 1.8411761165738731, "kl": 0.0643310546875, "learning_rate": 9.017333984374999e-07, "loss": 0.0026, "reward": 1.7131445407867432, "reward_std": 0.043391112238168716, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7131445109844208, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 269.6640625, "epoch": 0.3935546875, "grad_norm": 2.7482643969953817, "kl": 0.05810546875, "learning_rate": 9.016113281249999e-07, "loss": 0.0023, "reward": 1.7672026753425598, "reward_std": 0.14146682620048523, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7672027349472046, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 243.8984375, "epoch": 0.39404296875, "grad_norm": 5.7000530933748434, "kl": 0.0579833984375, "learning_rate": 9.014892578125e-07, "loss": 0.0023, "reward": 1.7555674314498901, "reward_std": 0.0818701907992363, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7555674016475677, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 316.0234375, "epoch": 0.39453125, "grad_norm": 2.716019769835689, "kl": 0.0574951171875, "learning_rate": 9.013671875e-07, "loss": 0.0023, "reward": 1.7889222502708435, "reward_std": 0.07327684760093689, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7889222204685211, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 379.015625, "epoch": 0.39501953125, "grad_norm": 1.0051393363186003, "kl": 0.046142578125, "learning_rate": 9.012451171875e-07, "loss": 0.0018, "reward": 1.823382318019867, "reward_std": 0.08756531029939651, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8311948478221893, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 336.9609375, "epoch": 0.3955078125, "grad_norm": 5.605975136552543, "kl": 0.0535888671875, "learning_rate": 9.01123046875e-07, "loss": 0.0021, "reward": 1.6305594444274902, "reward_std": 0.18838153779506683, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6774344146251678, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 271.0390625, "epoch": 0.39599609375, "grad_norm": 2.347124532736606, "kl": 0.067626953125, "learning_rate": 9.010009765624999e-07, "loss": 0.0027, "reward": 1.6546881794929504, "reward_std": 0.043467432260513306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6546881794929504, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 317.421875, "epoch": 0.396484375, "grad_norm": 5.529194590040078, "kl": 0.0498046875, "learning_rate": 9.008789062499999e-07, "loss": 0.002, "reward": 1.9438674449920654, "reward_std": 0.1620844528079033, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9516799449920654, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 384.53125, "epoch": 0.39697265625, "grad_norm": 4.064564658886712, "kl": 0.0567626953125, "learning_rate": 9.007568359374999e-07, "loss": 0.0023, "reward": 1.7015685439109802, "reward_std": 0.19768846035003662, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7250060141086578, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 363.6015625, "epoch": 0.3974609375, "grad_norm": 3.3533767553179086, "kl": 0.0438232421875, "learning_rate": 9.00634765625e-07, "loss": 0.0018, "reward": 1.708820104598999, "reward_std": 0.19447695463895798, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7400700449943542, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 284.9765625, "epoch": 0.39794921875, "grad_norm": 2.6790906488328483, "kl": 0.044921875, "learning_rate": 9.005126953125e-07, "loss": 0.0018, "reward": 1.721143901348114, "reward_std": 0.06836184859275818, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.721143901348114, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 224.3203125, "epoch": 0.3984375, "grad_norm": 1.1009725102471428, "kl": 0.0548095703125, "learning_rate": 9.00390625e-07, "loss": 0.0022, "reward": 1.8940476775169373, "reward_std": 0.03376696538180113, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8940476477146149, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 271.0, "epoch": 0.39892578125, "grad_norm": 14.243353602718932, "kl": 0.05517578125, "learning_rate": 9.002685546875e-07, "loss": 0.0022, "reward": 1.7576437592506409, "reward_std": 0.08673252165317535, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7576436996459961, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 280.03125, "epoch": 0.3994140625, "grad_norm": 4.958911702356191, "kl": 0.0601806640625, "learning_rate": 9.001464843749999e-07, "loss": 0.0024, "reward": 1.7922693490982056, "reward_std": 0.07825984340161085, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8000818490982056, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 331.109375, "epoch": 0.39990234375, "grad_norm": 1.3155145245659365, "kl": 0.050537109375, "learning_rate": 9.000244140624999e-07, "loss": 0.002, "reward": 1.6974967122077942, "reward_std": 0.11296156048774719, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7053092420101166, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 345.65625, "epoch": 0.400390625, "grad_norm": 2.13339913891259, "kl": 0.055419921875, "learning_rate": 8.9990234375e-07, "loss": 0.0022, "reward": 1.707070529460907, "reward_std": 0.12426425144076347, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7148829698562622, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 284.984375, "epoch": 0.40087890625, "grad_norm": 3.546816241737904, "kl": 0.063232421875, "learning_rate": 8.997802734375e-07, "loss": 0.0025, "reward": 1.7018118500709534, "reward_std": 0.15557154268026352, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.717436820268631, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 321.2421875, "epoch": 0.4013671875, "grad_norm": 2.7078843795022296, "kl": 0.05029296875, "learning_rate": 8.99658203125e-07, "loss": 0.002, "reward": 1.6248914003372192, "reward_std": 0.09886835888028145, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.624891385436058, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 322.65625, "epoch": 0.40185546875, "grad_norm": 1.1173934118480147, "kl": 0.056396484375, "learning_rate": 8.995361328125e-07, "loss": 0.0023, "reward": 1.670366883277893, "reward_std": 0.022474923171103, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6703668832778931, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 293.5546875, "epoch": 0.40234375, "grad_norm": 1.1487567312767635, "kl": 0.0540771484375, "learning_rate": 8.994140624999999e-07, "loss": 0.0022, "reward": 1.6762340068817139, "reward_std": 0.06469432264566422, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6762339472770691, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 305.2578125, "epoch": 0.40283203125, "grad_norm": 0.47294641259340225, "kl": 0.041015625, "learning_rate": 8.992919921874999e-07, "loss": 0.0016, "reward": 1.9129234552383423, "reward_std": 0.011541639920324087, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9129234850406647, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 365.625, "epoch": 0.4033203125, "grad_norm": 2.0908075256979757, "kl": 0.0526123046875, "learning_rate": 8.991699218749999e-07, "loss": 0.0021, "reward": 1.5521747469902039, "reward_std": 0.1392434984445572, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5990498065948486, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 315.71875, "epoch": 0.40380859375, "grad_norm": 1.0870502036918956, "kl": 0.0484619140625, "learning_rate": 8.990478515625e-07, "loss": 0.0019, "reward": 1.6554855108261108, "reward_std": 0.15189751982688904, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6867355108261108, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 337.6640625, "epoch": 0.404296875, "grad_norm": 1.5086234778630288, "kl": 0.0494384765625, "learning_rate": 8.9892578125e-07, "loss": 0.002, "reward": 1.5656054019927979, "reward_std": 0.13260553404688835, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5890428274869919, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 270.6328125, "epoch": 0.40478515625, "grad_norm": 3.1905532448295366, "kl": 0.054931640625, "learning_rate": 8.988037109375e-07, "loss": 0.0022, "reward": 1.792038083076477, "reward_std": 0.08523118868470192, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7920379936695099, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 272.6484375, "epoch": 0.4052734375, "grad_norm": 1.4703480432855132, "kl": 0.052978515625, "learning_rate": 8.98681640625e-07, "loss": 0.0021, "reward": 1.7558820843696594, "reward_std": 0.04787625931203365, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7558820843696594, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 319.6484375, "epoch": 0.40576171875, "grad_norm": 7.185703738083162, "kl": 0.04248046875, "learning_rate": 8.985595703124999e-07, "loss": 0.0017, "reward": 1.8221890926361084, "reward_std": 0.05866616778075695, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8221890926361084, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 386.2890625, "epoch": 0.40625, "grad_norm": 2.145560956292342, "kl": 0.063232421875, "learning_rate": 8.984374999999999e-07, "loss": 0.0025, "reward": 1.685830295085907, "reward_std": 0.036165340803563595, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6858302354812622, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 254.8515625, "epoch": 0.40673828125, "grad_norm": 3.391928796703693, "kl": 0.0562744140625, "learning_rate": 8.983154296875e-07, "loss": 0.0023, "reward": 1.6803425550460815, "reward_std": 0.10987947136163712, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6803425848484039, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 365.0703125, "epoch": 0.4072265625, "grad_norm": 2.2044595955803206, "kl": 0.043701171875, "learning_rate": 8.98193359375e-07, "loss": 0.0017, "reward": 1.7496721744537354, "reward_std": 0.12323963642120361, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.757484644651413, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 247.0546875, "epoch": 0.40771484375, "grad_norm": 3.3613379939285344, "kl": 0.0556640625, "learning_rate": 8.980712890625e-07, "loss": 0.0022, "reward": 1.6558890342712402, "reward_std": 0.05606374144554138, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.655889093875885, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 249.203125, "epoch": 0.408203125, "grad_norm": 1.3604816411152902, "kl": 0.0491943359375, "learning_rate": 8.9794921875e-07, "loss": 0.002, "reward": 1.8900187015533447, "reward_std": 0.06370699405670166, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8900187015533447, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 348.5625, "epoch": 0.40869140625, "grad_norm": 2.3095956319983113, "kl": 0.0531005859375, "learning_rate": 8.978271484374999e-07, "loss": 0.0021, "reward": 1.5155598521232605, "reward_std": 0.11974064260721207, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5546222925186157, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 288.6015625, "epoch": 0.4091796875, "grad_norm": 1.6895406595858493, "kl": 0.05511474609375, "learning_rate": 8.977050781249999e-07, "loss": 0.0022, "reward": 1.7988107204437256, "reward_std": 0.04724998027086258, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7988106906414032, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 314.4609375, "epoch": 0.40966796875, "grad_norm": 38.657918876895465, "kl": 0.05810546875, "learning_rate": 8.975830078124999e-07, "loss": 0.0023, "reward": 1.707718014717102, "reward_std": 0.056786952540278435, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.707718014717102, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 352.0546875, "epoch": 0.41015625, "grad_norm": 3.54984380280518, "kl": 0.043701171875, "learning_rate": 8.974609375e-07, "loss": 0.0017, "reward": 1.6362690329551697, "reward_std": 0.046866053715348244, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6362690329551697, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 360.84375, "epoch": 0.41064453125, "grad_norm": 4.880454547688104, "kl": 0.0343017578125, "learning_rate": 8.973388671875e-07, "loss": 0.0014, "reward": 1.6713696718215942, "reward_std": 0.07554645650088787, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6791820526123047, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 259.75, "epoch": 0.4111328125, "grad_norm": 3.5422070331713558, "kl": 0.047607421875, "learning_rate": 8.97216796875e-07, "loss": 0.0019, "reward": 1.76535165309906, "reward_std": 0.07970313355326653, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7731641829013824, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 342.625, "epoch": 0.41162109375, "grad_norm": 1.6668469246862412, "kl": 0.0472412109375, "learning_rate": 8.970947265625e-07, "loss": 0.0019, "reward": 1.647118866443634, "reward_std": 0.15605220571160316, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6861813068389893, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 318.8125, "epoch": 0.412109375, "grad_norm": 1.4480209975960932, "kl": 0.036865234375, "learning_rate": 8.969726562499999e-07, "loss": 0.0015, "reward": 1.7060487270355225, "reward_std": 0.19439689815044403, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7372986376285553, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 351.375, "epoch": 0.41259765625, "grad_norm": 2.6692780266475387, "kl": 0.05126953125, "learning_rate": 8.968505859374999e-07, "loss": 0.0021, "reward": 1.7914454340934753, "reward_std": 0.07013567723333836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7914454638957977, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 440.234375, "epoch": 0.4130859375, "grad_norm": 1.4785225765927579, "kl": 0.0511474609375, "learning_rate": 8.96728515625e-07, "loss": 0.002, "reward": 1.6806397438049316, "reward_std": 0.09639265388250351, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6884523034095764, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 331.46875, "epoch": 0.41357421875, "grad_norm": 0.978438992258654, "kl": 0.0433349609375, "learning_rate": 8.966064453125e-07, "loss": 0.0017, "reward": 1.7947252988815308, "reward_std": 0.12237262353301048, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8103502690792084, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 284.859375, "epoch": 0.4140625, "grad_norm": 2.8553868780442206, "kl": 0.053466796875, "learning_rate": 8.96484375e-07, "loss": 0.0021, "reward": 1.7657200694084167, "reward_std": 0.12218839675188065, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7891575396060944, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 270.046875, "epoch": 0.41455078125, "grad_norm": 1.2336624642234275, "kl": 0.06689453125, "learning_rate": 8.963623046875e-07, "loss": 0.0027, "reward": 1.6515643000602722, "reward_std": 0.09395093843340874, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.675001859664917, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 289.1015625, "epoch": 0.4150390625, "grad_norm": 1.6078249545004046, "kl": 0.058837890625, "learning_rate": 8.96240234375e-07, "loss": 0.0024, "reward": 1.6375129222869873, "reward_std": 0.09645092487335205, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6375128775835037, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 259.734375, "epoch": 0.41552734375, "grad_norm": 1.7476214032090949, "kl": 0.0460205078125, "learning_rate": 8.961181640624999e-07, "loss": 0.0018, "reward": 1.8477584719657898, "reward_std": 0.020351408515125513, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8477585017681122, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 277.203125, "epoch": 0.416015625, "grad_norm": 1.538198841160422, "kl": 0.0655517578125, "learning_rate": 8.959960937499999e-07, "loss": 0.0026, "reward": 1.731358528137207, "reward_std": 0.09056920558214188, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.731358528137207, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 303.8671875, "epoch": 0.41650390625, "grad_norm": 5.895599134478584, "kl": 0.069091796875, "learning_rate": 8.958740234375e-07, "loss": 0.0028, "reward": 1.6020338535308838, "reward_std": 0.1456664614379406, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.617658793926239, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 264.546875, "epoch": 0.4169921875, "grad_norm": 9.469721145412421, "kl": 0.056396484375, "learning_rate": 8.95751953125e-07, "loss": 0.0023, "reward": 1.7787832021713257, "reward_std": 0.025636928156018257, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7787831723690033, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 304.7265625, "epoch": 0.41748046875, "grad_norm": 1.667838887256776, "kl": 0.0594482421875, "learning_rate": 8.956298828125e-07, "loss": 0.0024, "reward": 1.8459165692329407, "reward_std": 0.11014392226934433, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8537290096282959, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 329.84375, "epoch": 0.41796875, "grad_norm": 4.431806930390354, "kl": 0.0511474609375, "learning_rate": 8.955078125e-07, "loss": 0.002, "reward": 1.7043548822402954, "reward_std": 0.10635066404938698, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.712167501449585, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 353.4921875, "epoch": 0.41845703125, "grad_norm": 2.770248233041983, "kl": 0.0433349609375, "learning_rate": 8.953857421874999e-07, "loss": 0.0017, "reward": 1.7795735597610474, "reward_std": 0.12282518297433853, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.795198529958725, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 275.84375, "epoch": 0.4189453125, "grad_norm": 2.12159812965168, "kl": 0.064697265625, "learning_rate": 8.952636718749999e-07, "loss": 0.0026, "reward": 1.587533950805664, "reward_std": 0.1150995921343565, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6344089508056641, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 300.8828125, "epoch": 0.41943359375, "grad_norm": 1.8214435810008032, "kl": 0.0404052734375, "learning_rate": 8.951416015624999e-07, "loss": 0.0016, "reward": 1.7444366216659546, "reward_std": 0.11982932686805725, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7600616216659546, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 363.28125, "epoch": 0.419921875, "grad_norm": 1.745129886806224, "kl": 0.0435791015625, "learning_rate": 8.9501953125e-07, "loss": 0.0017, "reward": 1.84866863489151, "reward_std": 0.039832524955272675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8486685752868652, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 302.9609375, "epoch": 0.42041015625, "grad_norm": 2.169553869726486, "kl": 0.0478515625, "learning_rate": 8.948974609375e-07, "loss": 0.0019, "reward": 1.6781877279281616, "reward_std": 0.11633714661002159, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7094376981258392, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 229.546875, "epoch": 0.4208984375, "grad_norm": 2.890081333787121, "kl": 0.05615234375, "learning_rate": 8.94775390625e-07, "loss": 0.0022, "reward": 1.7455247640609741, "reward_std": 0.09918822348117828, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7455247640609741, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 297.1796875, "epoch": 0.42138671875, "grad_norm": 1.885623336317658, "kl": 0.054443359375, "learning_rate": 8.946533203125e-07, "loss": 0.0022, "reward": 1.7564916610717773, "reward_std": 0.12973085790872574, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7721166908740997, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 334.8359375, "epoch": 0.421875, "grad_norm": 10.017435172797613, "kl": 0.1231689453125, "learning_rate": 8.945312499999999e-07, "loss": 0.0049, "reward": 1.6419134140014648, "reward_std": 0.15503490716218948, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6653508841991425, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 332.3359375, "epoch": 0.42236328125, "grad_norm": 5.291462812901847, "kl": 0.0546875, "learning_rate": 8.944091796874999e-07, "loss": 0.0022, "reward": 1.619509994983673, "reward_std": 0.1739499308168888, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6429474651813507, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 332.203125, "epoch": 0.4228515625, "grad_norm": 3.9430144083478615, "kl": 0.0474853515625, "learning_rate": 8.94287109375e-07, "loss": 0.0019, "reward": 1.7385854721069336, "reward_std": 0.03805091604590416, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7385854721069336, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 308.59375, "epoch": 0.42333984375, "grad_norm": 1.6092457486819296, "kl": 0.04736328125, "learning_rate": 8.941650390625e-07, "loss": 0.0019, "reward": 1.8048319220542908, "reward_std": 0.06229471415281296, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8048319518566132, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 400.0546875, "epoch": 0.423828125, "grad_norm": 1.9104697019990664, "kl": 0.0382080078125, "learning_rate": 8.9404296875e-07, "loss": 0.0015, "reward": 1.6721869707107544, "reward_std": 0.2335866540670395, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7268744707107544, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 304.390625, "epoch": 0.42431640625, "grad_norm": 1.4769192488084104, "kl": 0.0577392578125, "learning_rate": 8.939208984375e-07, "loss": 0.0023, "reward": 1.7876529693603516, "reward_std": 0.12172145396471024, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8032780587673187, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 266.2265625, "epoch": 0.4248046875, "grad_norm": 2.4912707719685816, "kl": 0.063232421875, "learning_rate": 8.937988281249999e-07, "loss": 0.0025, "reward": 1.7117069959640503, "reward_std": 0.10087519139051437, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7195195257663727, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 268.890625, "epoch": 0.42529296875, "grad_norm": 2.904180822248156, "kl": 0.0533447265625, "learning_rate": 8.936767578124999e-07, "loss": 0.0021, "reward": 1.6977179646492004, "reward_std": 0.061658382415771484, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.697717934846878, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 302.3203125, "epoch": 0.42578125, "grad_norm": 3.9810543136057706, "kl": 0.05419921875, "learning_rate": 8.935546874999999e-07, "loss": 0.0022, "reward": 1.778558611869812, "reward_std": 0.13054338097572327, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7941837012767792, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 258.5, "epoch": 0.42626953125, "grad_norm": 2.638694510235747, "kl": 0.0634765625, "learning_rate": 8.934326171875e-07, "loss": 0.0025, "reward": 1.684062123298645, "reward_std": 0.1073136255145073, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6918745934963226, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 320.8359375, "epoch": 0.4267578125, "grad_norm": 1.0328136935198113, "kl": 0.040283203125, "learning_rate": 8.93310546875e-07, "loss": 0.0016, "reward": 1.7540948987007141, "reward_std": 0.057166170328855515, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7540949583053589, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 274.1953125, "epoch": 0.42724609375, "grad_norm": 7.360158040184964, "kl": 0.0521240234375, "learning_rate": 8.931884765625e-07, "loss": 0.0021, "reward": 1.7804943919181824, "reward_std": 0.02563006430864334, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7804943323135376, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 249.265625, "epoch": 0.427734375, "grad_norm": 2.550928083120494, "kl": 0.070556640625, "learning_rate": 8.9306640625e-07, "loss": 0.0028, "reward": 1.721437394618988, "reward_std": 0.09169731847941875, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.721437394618988, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 333.578125, "epoch": 0.42822265625, "grad_norm": 4.314294285455361, "kl": 0.0531005859375, "learning_rate": 8.929443359374999e-07, "loss": 0.0021, "reward": 1.7071447968482971, "reward_std": 0.12683348171412945, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7227697968482971, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 291.53125, "epoch": 0.4287109375, "grad_norm": 7.896509785870648, "kl": 0.0496826171875, "learning_rate": 8.928222656249999e-07, "loss": 0.002, "reward": 1.698940396308899, "reward_std": 0.06823573168367147, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6989404261112213, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 302.1953125, "epoch": 0.42919921875, "grad_norm": 4.419894880080495, "kl": 0.0576171875, "learning_rate": 8.927001953125e-07, "loss": 0.0023, "reward": 1.6944845914840698, "reward_std": 0.13580431789159775, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7022970914840698, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 340.640625, "epoch": 0.4296875, "grad_norm": 3.1624664713185777, "kl": 0.0535888671875, "learning_rate": 8.92578125e-07, "loss": 0.0021, "reward": 1.669293999671936, "reward_std": 0.1342175379395485, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.669293999671936, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 256.4453125, "epoch": 0.43017578125, "grad_norm": 2.394405821979668, "kl": 0.0677490234375, "learning_rate": 8.924560546875e-07, "loss": 0.0027, "reward": 1.7102238535881042, "reward_std": 0.07026012241840363, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7180363833904266, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 371.7578125, "epoch": 0.4306640625, "grad_norm": 1.9581328307353232, "kl": 0.0467529296875, "learning_rate": 8.92333984375e-07, "loss": 0.0019, "reward": 1.7540261149406433, "reward_std": 0.10421252250671387, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7618384957313538, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 366.375, "epoch": 0.43115234375, "grad_norm": 2.6355277149440064, "kl": 0.0487060546875, "learning_rate": 8.922119140624999e-07, "loss": 0.0019, "reward": 1.5595695972442627, "reward_std": 0.058571480214595795, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5595695376396179, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 369.453125, "epoch": 0.431640625, "grad_norm": 2.4619908418967618, "kl": 0.0504150390625, "learning_rate": 8.920898437499999e-07, "loss": 0.002, "reward": 1.718446969985962, "reward_std": 0.15201827883720398, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7496969699859619, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 301.796875, "epoch": 0.43212890625, "grad_norm": 1.9043119997675124, "kl": 0.0589599609375, "learning_rate": 8.919677734374999e-07, "loss": 0.0024, "reward": 1.6135079860687256, "reward_std": 0.0632172767072916, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6135080456733704, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 276.9453125, "epoch": 0.4326171875, "grad_norm": 2.828936256887094, "kl": 0.0567626953125, "learning_rate": 8.91845703125e-07, "loss": 0.0023, "reward": 1.8562658429145813, "reward_std": 0.043327707797288895, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8562657833099365, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 327.5703125, "epoch": 0.43310546875, "grad_norm": 3.794004727125142, "kl": 0.061767578125, "learning_rate": 8.917236328125e-07, "loss": 0.0025, "reward": 1.6978505849838257, "reward_std": 0.09938319772481918, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6978505551815033, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 247.9375, "epoch": 0.43359375, "grad_norm": 1.7545127538372571, "kl": 0.050048828125, "learning_rate": 8.916015625e-07, "loss": 0.002, "reward": 1.7462196350097656, "reward_std": 0.08687572181224823, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7540321350097656, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 322.984375, "epoch": 0.43408203125, "grad_norm": 4.59557739433223, "kl": 0.053466796875, "learning_rate": 8.914794921875e-07, "loss": 0.0021, "reward": 1.664870023727417, "reward_std": 0.13343672454357147, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.680495023727417, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 368.6796875, "epoch": 0.4345703125, "grad_norm": 3.0527472360411747, "kl": 0.0570068359375, "learning_rate": 8.913574218749999e-07, "loss": 0.0023, "reward": 1.633752703666687, "reward_std": 0.2018553614616394, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.657190203666687, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 230.8671875, "epoch": 0.43505859375, "grad_norm": 5.7762219418155025, "kl": 0.05517578125, "learning_rate": 8.912353515624999e-07, "loss": 0.0022, "reward": 1.684194028377533, "reward_std": 0.09600569307804108, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6841940879821777, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 343.671875, "epoch": 0.435546875, "grad_norm": 3.3065383732267257, "kl": 0.0523681640625, "learning_rate": 8.9111328125e-07, "loss": 0.0021, "reward": 1.6547590494155884, "reward_std": 0.19414672255516052, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7094466388225555, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 322.4921875, "epoch": 0.43603515625, "grad_norm": 1.4003093807212736, "kl": 0.063720703125, "learning_rate": 8.909912109375e-07, "loss": 0.0025, "reward": 1.7039158940315247, "reward_std": 0.11482829600572586, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7117283940315247, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 246.6640625, "epoch": 0.4365234375, "grad_norm": 1.3160510217088606, "kl": 0.0574951171875, "learning_rate": 8.90869140625e-07, "loss": 0.0023, "reward": 1.833851397037506, "reward_std": 0.026355454698204994, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8338513970375061, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 250.1171875, "epoch": 0.43701171875, "grad_norm": 1.099488871794088, "kl": 0.0526123046875, "learning_rate": 8.907470703125e-07, "loss": 0.0021, "reward": 1.735145926475525, "reward_std": 0.050868917256593704, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7351458072662354, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 290.3515625, "epoch": 0.4375, "grad_norm": 10.520352255307145, "kl": 0.049560546875, "learning_rate": 8.906249999999999e-07, "loss": 0.002, "reward": 1.5713690519332886, "reward_std": 0.17703481018543243, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5869940519332886, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 255.0, "epoch": 0.43798828125, "grad_norm": 4.9102977822696525, "kl": 0.059814453125, "learning_rate": 8.905029296874999e-07, "loss": 0.0024, "reward": 1.6745886206626892, "reward_std": 0.056372467428445816, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6745886504650116, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 316.609375, "epoch": 0.4384765625, "grad_norm": 2.850983593865307, "kl": 0.04638671875, "learning_rate": 8.903808593749999e-07, "loss": 0.0019, "reward": 1.7233811616897583, "reward_std": 0.07739730924367905, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7311936020851135, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 311.828125, "epoch": 0.43896484375, "grad_norm": 2.0785423808690977, "kl": 0.0546875, "learning_rate": 8.902587890625e-07, "loss": 0.0022, "reward": 1.764865517616272, "reward_std": 0.06689143739640713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7648655772209167, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 359.4765625, "epoch": 0.439453125, "grad_norm": 3.4689071084431946, "kl": 0.0462646484375, "learning_rate": 8.9013671875e-07, "loss": 0.0018, "reward": 1.6617870926856995, "reward_std": 0.1315966732800007, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6852246224880219, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 366.421875, "epoch": 0.43994140625, "grad_norm": 1.5582536488441514, "kl": 0.0484619140625, "learning_rate": 8.900146484375e-07, "loss": 0.0019, "reward": 1.6939795017242432, "reward_std": 0.1498698815703392, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7252295911312103, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 241.890625, "epoch": 0.4404296875, "grad_norm": 3.2999043304034026, "kl": 0.0615234375, "learning_rate": 8.89892578125e-07, "loss": 0.0025, "reward": 1.6509242057800293, "reward_std": 0.10151878371834755, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6509242355823517, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 363.03125, "epoch": 0.44091796875, "grad_norm": 16.23235566949286, "kl": 0.0509033203125, "learning_rate": 8.897705078124999e-07, "loss": 0.002, "reward": 1.7497307658195496, "reward_std": 0.06852127611637115, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7497306764125824, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 353.1875, "epoch": 0.44140625, "grad_norm": 2.539616901726096, "kl": 0.0562744140625, "learning_rate": 8.896484374999999e-07, "loss": 0.0023, "reward": 1.6935822367668152, "reward_std": 0.14617926999926567, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7170197069644928, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 369.1015625, "epoch": 0.44189453125, "grad_norm": 1.9069482845643857, "kl": 0.050048828125, "learning_rate": 8.895263671875e-07, "loss": 0.002, "reward": 1.6976945996284485, "reward_std": 0.10638157278299332, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7055070698261261, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 265.7734375, "epoch": 0.4423828125, "grad_norm": 3.849069538906998, "kl": 0.0516357421875, "learning_rate": 8.89404296875e-07, "loss": 0.0021, "reward": 1.7132678627967834, "reward_std": 0.138364490121603, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7132679224014282, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 208.0390625, "epoch": 0.44287109375, "grad_norm": 2.305802120564667, "kl": 0.0523681640625, "learning_rate": 8.892822265625e-07, "loss": 0.0021, "reward": 1.7869673371315002, "reward_std": 0.07257464155554771, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7869673371315002, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 356.8671875, "epoch": 0.443359375, "grad_norm": 26.058746364247334, "kl": 0.055908203125, "learning_rate": 8.8916015625e-07, "loss": 0.0022, "reward": 1.628940463066101, "reward_std": 0.13616503030061722, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6445655226707458, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 262.578125, "epoch": 0.44384765625, "grad_norm": 2.122149008743692, "kl": 0.048095703125, "learning_rate": 8.890380859374999e-07, "loss": 0.0019, "reward": 1.7516308426856995, "reward_std": 0.0674322908744216, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7594433426856995, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 294.0546875, "epoch": 0.4443359375, "grad_norm": 4.580725380018095, "kl": 0.0489501953125, "learning_rate": 8.889160156249999e-07, "loss": 0.002, "reward": 1.730670690536499, "reward_std": 0.11633214727044106, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7462956309318542, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 324.3828125, "epoch": 0.44482421875, "grad_norm": 1.320650252057893, "kl": 0.0423583984375, "learning_rate": 8.887939453124999e-07, "loss": 0.0017, "reward": 1.7391434907913208, "reward_std": 0.19686751067638397, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7547684013843536, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 329.6328125, "epoch": 0.4453125, "grad_norm": 1.4523590967417215, "kl": 0.0565185546875, "learning_rate": 8.88671875e-07, "loss": 0.0023, "reward": 1.5424267649650574, "reward_std": 0.07458284497261047, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.550239235162735, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 314.3515625, "epoch": 0.44580078125, "grad_norm": 1.719086973866284, "kl": 0.052734375, "learning_rate": 8.885498046875e-07, "loss": 0.0021, "reward": 1.642267882823944, "reward_std": 0.12693988159298897, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6657053828239441, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 307.3359375, "epoch": 0.4462890625, "grad_norm": 3.2810083273339217, "kl": 0.0501708984375, "learning_rate": 8.88427734375e-07, "loss": 0.002, "reward": 1.6578189134597778, "reward_std": 0.14238969795405865, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6890688836574554, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 257.0546875, "epoch": 0.44677734375, "grad_norm": 2.701614835256679, "kl": 0.0416259765625, "learning_rate": 8.883056640625e-07, "loss": 0.0017, "reward": 1.8191250562667847, "reward_std": 0.08679736405611038, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8191250264644623, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 328.2890625, "epoch": 0.447265625, "grad_norm": 4.414560910867886, "kl": 0.0560302734375, "learning_rate": 8.881835937499999e-07, "loss": 0.0022, "reward": 1.700971245765686, "reward_std": 0.0659741573035717, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7009712755680084, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 363.6953125, "epoch": 0.44775390625, "grad_norm": 1.7777346701649772, "kl": 0.041259765625, "learning_rate": 8.880615234374999e-07, "loss": 0.0017, "reward": 1.7923877239227295, "reward_std": 0.052391206845641136, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7923877835273743, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 277.9921875, "epoch": 0.4482421875, "grad_norm": 0.9486222650729225, "kl": 0.0435791015625, "learning_rate": 8.87939453125e-07, "loss": 0.0017, "reward": 1.6959292888641357, "reward_std": 0.02422085404396057, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6959293782711029, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 366.7734375, "epoch": 0.44873046875, "grad_norm": 1.6490253942008113, "kl": 0.052490234375, "learning_rate": 8.878173828125e-07, "loss": 0.0021, "reward": 1.5897186398506165, "reward_std": 0.13733144104480743, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5975310802459717, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 328.1015625, "epoch": 0.44921875, "grad_norm": 1.4659485502265033, "kl": 0.0418701171875, "learning_rate": 8.876953125e-07, "loss": 0.0017, "reward": 1.6723748445510864, "reward_std": 0.09266382362693548, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6723748296499252, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 381.0234375, "epoch": 0.44970703125, "grad_norm": 2.0567450128282423, "kl": 0.0418701171875, "learning_rate": 8.875732421875e-07, "loss": 0.0017, "reward": 1.635401725769043, "reward_std": 0.11173927411437035, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6432141959667206, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 338.125, "epoch": 0.4501953125, "grad_norm": 6.406156350449351, "kl": 0.041748046875, "learning_rate": 8.874511718749999e-07, "loss": 0.0017, "reward": 1.7291421294212341, "reward_std": 0.051562756299972534, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7291421294212341, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 304.4765625, "epoch": 0.45068359375, "grad_norm": 2.9529489375072915, "kl": 0.0458984375, "learning_rate": 8.873291015624999e-07, "loss": 0.0018, "reward": 1.7142540216445923, "reward_std": 0.157925084233284, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7298789620399475, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 269.0546875, "epoch": 0.451171875, "grad_norm": 0.9225384580987055, "kl": 0.059326171875, "learning_rate": 8.872070312499999e-07, "loss": 0.0024, "reward": 1.7622966170310974, "reward_std": 0.06992994248867035, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7701090574264526, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 352.0703125, "epoch": 0.45166015625, "grad_norm": 0.8543291431386741, "kl": 0.0465087890625, "learning_rate": 8.870849609375e-07, "loss": 0.0019, "reward": 1.664566159248352, "reward_std": 0.04729248210787773, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.664566159248352, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 279.671875, "epoch": 0.4521484375, "grad_norm": 4.361104873897493, "kl": 0.051025390625, "learning_rate": 8.86962890625e-07, "loss": 0.002, "reward": 1.6637241840362549, "reward_std": 0.06688250973820686, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6637241840362549, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 351.9765625, "epoch": 0.45263671875, "grad_norm": 1.321086242390277, "kl": 0.0428466796875, "learning_rate": 8.868408203125e-07, "loss": 0.0017, "reward": 1.609758734703064, "reward_std": 0.1022709459066391, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6331963092088699, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 269.4140625, "epoch": 0.453125, "grad_norm": 4.639254611969434, "kl": 0.0499267578125, "learning_rate": 8.8671875e-07, "loss": 0.002, "reward": 1.7151271104812622, "reward_std": 0.056340851821005344, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7151271104812622, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 252.8671875, "epoch": 0.45361328125, "grad_norm": 9.59117348810988, "kl": 0.044921875, "learning_rate": 8.865966796874999e-07, "loss": 0.0018, "reward": 1.7646648287773132, "reward_std": 0.08957374095916748, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7724774181842804, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 401.25, "epoch": 0.4541015625, "grad_norm": 1.774508659785477, "kl": 0.0496826171875, "learning_rate": 8.864746093749999e-07, "loss": 0.002, "reward": 1.6190925240516663, "reward_std": 0.1272077076137066, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6269050240516663, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 363.6953125, "epoch": 0.45458984375, "grad_norm": 1.9603229874947314, "kl": 0.0439453125, "learning_rate": 8.863525390625e-07, "loss": 0.0018, "reward": 1.7904832363128662, "reward_std": 0.0836594682186842, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7982957363128662, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 353.421875, "epoch": 0.455078125, "grad_norm": 3.3693206055984968, "kl": 0.0513916015625, "learning_rate": 8.8623046875e-07, "loss": 0.0021, "reward": 1.662086844444275, "reward_std": 0.10136513970792294, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6698993444442749, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 296.234375, "epoch": 0.45556640625, "grad_norm": 2.867780998149891, "kl": 0.0440673828125, "learning_rate": 8.861083984375e-07, "loss": 0.0018, "reward": 1.7205028533935547, "reward_std": 0.11783993989229202, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7205029428005219, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 283.390625, "epoch": 0.4560546875, "grad_norm": 3.335016088362262, "kl": 0.0521240234375, "learning_rate": 8.85986328125e-07, "loss": 0.0021, "reward": 1.6430580615997314, "reward_std": 0.1293087601661682, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6664955615997314, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 225.234375, "epoch": 0.45654296875, "grad_norm": 3.2388328242370283, "kl": 0.0621337890625, "learning_rate": 8.858642578124999e-07, "loss": 0.0025, "reward": 1.7366413474082947, "reward_std": 0.06143258325755596, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7366413474082947, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 286.390625, "epoch": 0.45703125, "grad_norm": 6.307742284245101, "kl": 0.044921875, "learning_rate": 8.857421874999999e-07, "loss": 0.0018, "reward": 1.8411588072776794, "reward_std": 0.09207257255911827, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8411588072776794, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 310.828125, "epoch": 0.45751953125, "grad_norm": 1.8548370757528454, "kl": 0.0482177734375, "learning_rate": 8.856201171874999e-07, "loss": 0.0019, "reward": 1.673714518547058, "reward_std": 0.12045683711767197, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6737145185470581, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 261.4453125, "epoch": 0.4580078125, "grad_norm": 2.9790179157150005, "kl": 0.0499267578125, "learning_rate": 8.85498046875e-07, "loss": 0.002, "reward": 1.758617639541626, "reward_std": 0.06485863775014877, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7586176097393036, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 364.4375, "epoch": 0.45849609375, "grad_norm": 2.1066399082293747, "kl": 0.0489501953125, "learning_rate": 8.853759765625e-07, "loss": 0.002, "reward": 1.6640775203704834, "reward_std": 0.08030284568667412, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6640775203704834, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 282.6640625, "epoch": 0.458984375, "grad_norm": 29.994085492419032, "kl": 0.0548095703125, "learning_rate": 8.8525390625e-07, "loss": 0.0022, "reward": 1.742246389389038, "reward_std": 0.11800673604011536, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7500588893890381, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 275.8203125, "epoch": 0.45947265625, "grad_norm": 2.0008366557399246, "kl": 0.057861328125, "learning_rate": 8.851318359375e-07, "loss": 0.0023, "reward": 1.8033297061920166, "reward_std": 0.15194324404001236, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8267672061920166, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 364.8125, "epoch": 0.4599609375, "grad_norm": 1.242202564610936, "kl": 0.04638671875, "learning_rate": 8.850097656249999e-07, "loss": 0.0019, "reward": 1.5236690640449524, "reward_std": 0.09883632883429527, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5471065491437912, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 411.1484375, "epoch": 0.46044921875, "grad_norm": 2.723081001967419, "kl": 0.0477294921875, "learning_rate": 8.848876953124999e-07, "loss": 0.0019, "reward": 1.7074534893035889, "reward_std": 0.08867547661066055, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7152659893035889, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 312.8203125, "epoch": 0.4609375, "grad_norm": 2.9268520095350783, "kl": 0.0587158203125, "learning_rate": 8.84765625e-07, "loss": 0.0023, "reward": 1.7078036665916443, "reward_std": 0.04931185767054558, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7078036367893219, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 294.390625, "epoch": 0.46142578125, "grad_norm": 2.101619140746079, "kl": 0.044921875, "learning_rate": 8.846435546875e-07, "loss": 0.0018, "reward": 1.7718039155006409, "reward_std": 0.051894426345825195, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7718039155006409, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 287.5546875, "epoch": 0.4619140625, "grad_norm": 0.8404176478040042, "kl": 0.0435791015625, "learning_rate": 8.84521484375e-07, "loss": 0.0017, "reward": 1.7640173435211182, "reward_std": 0.07772124605253339, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7718298435211182, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 160.078125, "epoch": 0.46240234375, "grad_norm": 1.1816393651871933, "kl": 0.058837890625, "learning_rate": 8.843994140625e-07, "loss": 0.0024, "reward": 1.8239731788635254, "reward_std": 0.0738510899245739, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8317857086658478, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 343.6796875, "epoch": 0.462890625, "grad_norm": 2.4437099072575146, "kl": 0.0467529296875, "learning_rate": 8.8427734375e-07, "loss": 0.0019, "reward": 1.8321685194969177, "reward_std": 0.06024608574807644, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8321685194969177, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 202.265625, "epoch": 0.46337890625, "grad_norm": 1.6390280058599591, "kl": 0.064697265625, "learning_rate": 8.841552734374999e-07, "loss": 0.0026, "reward": 1.795024573802948, "reward_std": 0.08007996901869774, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7950246036052704, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 268.3671875, "epoch": 0.4638671875, "grad_norm": 7.952462151127041, "kl": 0.0531005859375, "learning_rate": 8.840332031249999e-07, "loss": 0.0021, "reward": 1.6938014030456543, "reward_std": 0.0921289250254631, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6938014030456543, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 329.28125, "epoch": 0.46435546875, "grad_norm": 2.6547911721421067, "kl": 0.0528564453125, "learning_rate": 8.839111328125e-07, "loss": 0.0021, "reward": 1.7591851353645325, "reward_std": 0.14495818316936493, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7826226055622101, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 328.2265625, "epoch": 0.46484375, "grad_norm": 35.96771698306349, "kl": 0.1925048828125, "learning_rate": 8.837890625e-07, "loss": 0.0077, "reward": 1.7612931728363037, "reward_std": 0.1344320885837078, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7847306728363037, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 360.1015625, "epoch": 0.46533203125, "grad_norm": 15.788401904321928, "kl": 0.038330078125, "learning_rate": 8.836669921875e-07, "loss": 0.0015, "reward": 1.7165034413337708, "reward_std": 0.08753632940351963, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.724315881729126, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 360.203125, "epoch": 0.4658203125, "grad_norm": 1.8975472946536276, "kl": 0.0462646484375, "learning_rate": 8.83544921875e-07, "loss": 0.0018, "reward": 1.653282880783081, "reward_std": 0.14220409467816353, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.668907880783081, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 330.84375, "epoch": 0.46630859375, "grad_norm": 0.9088916050596484, "kl": 0.0372314453125, "learning_rate": 8.834228515624999e-07, "loss": 0.0015, "reward": 1.8434149026870728, "reward_std": 0.028221886605024338, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8434148728847504, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 276.3203125, "epoch": 0.466796875, "grad_norm": 2.81285479890254, "kl": 0.060546875, "learning_rate": 8.833007812499999e-07, "loss": 0.0024, "reward": 1.6376739144325256, "reward_std": 0.12609807774424553, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6454864144325256, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 253.2421875, "epoch": 0.46728515625, "grad_norm": 0.7538661853407663, "kl": 0.0445556640625, "learning_rate": 8.831787109374999e-07, "loss": 0.0018, "reward": 1.6063887476921082, "reward_std": 0.09519611299037933, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6376387178897858, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 322.9453125, "epoch": 0.4677734375, "grad_norm": 1.6060344661519697, "kl": 0.0537109375, "learning_rate": 8.83056640625e-07, "loss": 0.0021, "reward": 1.572835922241211, "reward_std": 0.2594187408685684, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6431483775377274, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 295.53125, "epoch": 0.46826171875, "grad_norm": 3.1560722562556003, "kl": 0.049072265625, "learning_rate": 8.829345703125e-07, "loss": 0.002, "reward": 1.6071619987487793, "reward_std": 0.10899307206273079, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6227870583534241, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 216.1015625, "epoch": 0.46875, "grad_norm": 2.778150778773273, "kl": 0.0599365234375, "learning_rate": 8.828125e-07, "loss": 0.0024, "reward": 1.7072476148605347, "reward_std": 0.03206057846546173, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7072476148605347, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 277.4453125, "epoch": 0.46923828125, "grad_norm": 2.505049331757069, "kl": 0.0565185546875, "learning_rate": 8.826904296875e-07, "loss": 0.0023, "reward": 1.7165246605873108, "reward_std": 0.11047841422259808, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7477746307849884, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 282.125, "epoch": 0.4697265625, "grad_norm": 2.3133113705274453, "kl": 0.058349609375, "learning_rate": 8.825683593749999e-07, "loss": 0.0023, "reward": 1.663506269454956, "reward_std": 0.14057481661438942, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6947563290596008, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 292.5078125, "epoch": 0.47021484375, "grad_norm": 1.694135344638883, "kl": 0.05712890625, "learning_rate": 8.824462890624999e-07, "loss": 0.0023, "reward": 1.8244240880012512, "reward_std": 0.05446392297744751, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8244240880012512, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 383.609375, "epoch": 0.470703125, "grad_norm": 2.6371365345290045, "kl": 0.0440673828125, "learning_rate": 8.8232421875e-07, "loss": 0.0018, "reward": 1.7416203618049622, "reward_std": 0.12130639143288136, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7650578618049622, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 379.6484375, "epoch": 0.47119140625, "grad_norm": 2.218591135498477, "kl": 0.04278564453125, "learning_rate": 8.822021484375e-07, "loss": 0.0017, "reward": 1.6755053400993347, "reward_std": 0.14992902055382729, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.7458178997039795, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 356.3359375, "epoch": 0.4716796875, "grad_norm": 1.303009090371895, "kl": 0.061279296875, "learning_rate": 8.82080078125e-07, "loss": 0.0025, "reward": 1.6562331914901733, "reward_std": 0.08586933836340904, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6718582212924957, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 257.40625, "epoch": 0.47216796875, "grad_norm": 9.095150959885645, "kl": 0.0653076171875, "learning_rate": 8.819580078125e-07, "loss": 0.0026, "reward": 1.7505657076835632, "reward_std": 0.04661328159272671, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.750565767288208, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 373.2890625, "epoch": 0.47265625, "grad_norm": 2.6818988160026658, "kl": 0.05078125, "learning_rate": 8.818359374999999e-07, "loss": 0.002, "reward": 1.7254713773727417, "reward_std": 0.2372339516878128, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7645338177680969, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 228.75, "epoch": 0.47314453125, "grad_norm": 2.2365369221179696, "kl": 0.062744140625, "learning_rate": 8.817138671874999e-07, "loss": 0.0025, "reward": 1.7773956656455994, "reward_std": 0.0699392519891262, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7773956060409546, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 298.4921875, "epoch": 0.4736328125, "grad_norm": 3.807273507290948, "kl": 0.0625, "learning_rate": 8.815917968749999e-07, "loss": 0.0025, "reward": 1.6723923683166504, "reward_std": 0.1734137311577797, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.680204838514328, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 313.5625, "epoch": 0.47412109375, "grad_norm": 0.8599955165133477, "kl": 0.0482177734375, "learning_rate": 8.814697265625e-07, "loss": 0.0019, "reward": 1.854150652885437, "reward_std": 0.06166762858629227, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.854150652885437, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 355.28125, "epoch": 0.474609375, "grad_norm": 4.899318801004212, "kl": 0.043701171875, "learning_rate": 8.8134765625e-07, "loss": 0.0017, "reward": 1.817629873752594, "reward_std": 0.09293503686785698, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8254423439502716, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 268.8203125, "epoch": 0.47509765625, "grad_norm": 4.869813713992535, "kl": 0.0604248046875, "learning_rate": 8.812255859375e-07, "loss": 0.0024, "reward": 1.7895490527153015, "reward_std": 0.06284810416400433, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7895489931106567, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 275.2265625, "epoch": 0.4755859375, "grad_norm": 2.71150507007993, "kl": 0.0546875, "learning_rate": 8.81103515625e-07, "loss": 0.0022, "reward": 1.8050071597099304, "reward_std": 0.11932638473808765, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8206321597099304, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 254.890625, "epoch": 0.47607421875, "grad_norm": 2.495906498649002, "kl": 0.0635986328125, "learning_rate": 8.809814453124999e-07, "loss": 0.0025, "reward": 1.6766229271888733, "reward_std": 0.06276751309633255, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6766228377819061, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 299.2890625, "epoch": 0.4765625, "grad_norm": 2.0928693525491595, "kl": 0.048828125, "learning_rate": 8.808593749999999e-07, "loss": 0.002, "reward": 1.6983801126480103, "reward_std": 0.11064053699374199, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7140050232410431, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 276.6796875, "epoch": 0.47705078125, "grad_norm": 1.9579863122715575, "kl": 0.0552978515625, "learning_rate": 8.807373046875e-07, "loss": 0.0022, "reward": 1.6679657697677612, "reward_std": 0.05069480650126934, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6679657995700836, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 369.921875, "epoch": 0.4775390625, "grad_norm": 1.0533835681539812, "kl": 0.0489501953125, "learning_rate": 8.80615234375e-07, "loss": 0.002, "reward": 1.7244895100593567, "reward_std": 0.11370780691504478, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7401144802570343, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 395.9609375, "epoch": 0.47802734375, "grad_norm": 0.9675050638605003, "kl": 0.038330078125, "learning_rate": 8.804931640625e-07, "loss": 0.0015, "reward": 1.7015687227249146, "reward_std": 0.1075466200709343, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7171937227249146, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 289.4140625, "epoch": 0.478515625, "grad_norm": 1.5989887143012946, "kl": 0.0556640625, "learning_rate": 8.8037109375e-07, "loss": 0.0022, "reward": 1.647861123085022, "reward_std": 0.08899911493062973, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.647861123085022, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 366.03125, "epoch": 0.47900390625, "grad_norm": 3.564295998020266, "kl": 0.047119140625, "learning_rate": 8.802490234374999e-07, "loss": 0.0019, "reward": 1.699999213218689, "reward_std": 0.20631136745214462, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.723436713218689, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 232.4765625, "epoch": 0.4794921875, "grad_norm": 16.58975911359156, "kl": 0.0689697265625, "learning_rate": 8.801269531249999e-07, "loss": 0.0028, "reward": 1.6581519842147827, "reward_std": 0.06756994873285294, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6581519246101379, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 233.953125, "epoch": 0.47998046875, "grad_norm": 2.7300942691577283, "kl": 0.05859375, "learning_rate": 8.800048828124999e-07, "loss": 0.0023, "reward": 1.7354393601417542, "reward_std": 0.050481900572776794, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7354393303394318, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 281.8671875, "epoch": 0.48046875, "grad_norm": 4.98249531747594, "kl": 0.0531005859375, "learning_rate": 8.798828125e-07, "loss": 0.0021, "reward": 1.7789223194122314, "reward_std": 0.10021020472049713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7789223790168762, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 318.734375, "epoch": 0.48095703125, "grad_norm": 2.6662511183569135, "kl": 0.0576171875, "learning_rate": 8.797607421875e-07, "loss": 0.0023, "reward": 1.7876138091087341, "reward_std": 0.09649738110601902, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7954262793064117, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 387.53125, "epoch": 0.4814453125, "grad_norm": 1.910422280476843, "kl": 0.0419921875, "learning_rate": 8.79638671875e-07, "loss": 0.0017, "reward": 1.7141448259353638, "reward_std": 0.129779651761055, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7297699153423309, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 279.3359375, "epoch": 0.48193359375, "grad_norm": 4.7292076134081364, "kl": 0.058837890625, "learning_rate": 8.795166015625e-07, "loss": 0.0023, "reward": 1.5576480627059937, "reward_std": 0.08005259186029434, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5576481074094772, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 304.90625, "epoch": 0.482421875, "grad_norm": 88.09217055784015, "kl": 0.0555419921875, "learning_rate": 8.793945312499999e-07, "loss": 0.0022, "reward": 1.7375428676605225, "reward_std": 0.055084478110075, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7375428080558777, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 411.5390625, "epoch": 0.48291015625, "grad_norm": 2.034132921180751, "kl": 0.04638671875, "learning_rate": 8.792724609374999e-07, "loss": 0.0019, "reward": 1.5709097981452942, "reward_std": 0.16416695713996887, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.602159857749939, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 309.984375, "epoch": 0.4833984375, "grad_norm": 2.0802002056866566, "kl": 0.0771484375, "learning_rate": 8.79150390625e-07, "loss": 0.0031, "reward": 1.762831211090088, "reward_std": 0.13553397357463837, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7862686514854431, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 277.3203125, "epoch": 0.48388671875, "grad_norm": 2.7772153375663637, "kl": 0.091552734375, "learning_rate": 8.790283203125e-07, "loss": 0.0037, "reward": 1.6474461555480957, "reward_std": 0.05931936576962471, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6474461555480957, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 330.5703125, "epoch": 0.484375, "grad_norm": 6.876528890436644, "kl": 0.072265625, "learning_rate": 8.7890625e-07, "loss": 0.0029, "reward": 1.698776364326477, "reward_std": 0.12324061989784241, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.714401364326477, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 360.3359375, "epoch": 0.48486328125, "grad_norm": 1.9851363650800362, "kl": 0.0592041015625, "learning_rate": 8.787841796875e-07, "loss": 0.0024, "reward": 1.696526050567627, "reward_std": 0.10686031728982925, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7199635207653046, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 285.546875, "epoch": 0.4853515625, "grad_norm": 0.6005806822321215, "kl": 0.039794921875, "learning_rate": 8.786621093749999e-07, "loss": 0.0016, "reward": 1.7581510543823242, "reward_std": 0.02178693562746048, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7581509947776794, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 313.90625, "epoch": 0.48583984375, "grad_norm": 1.2977429398294698, "kl": 0.0615234375, "learning_rate": 8.785400390624999e-07, "loss": 0.0025, "reward": 1.6917137503623962, "reward_std": 0.0859937984496355, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6995262205600739, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 367.3984375, "epoch": 0.486328125, "grad_norm": 5.4475694378610235, "kl": 0.044921875, "learning_rate": 8.784179687499999e-07, "loss": 0.0018, "reward": 1.7169365882873535, "reward_std": 0.12018711119890213, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7325615584850311, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 345.0703125, "epoch": 0.48681640625, "grad_norm": 1.8640936228979166, "kl": 0.0543212890625, "learning_rate": 8.782958984375e-07, "loss": 0.0022, "reward": 1.7613821029663086, "reward_std": 0.09000418707728386, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7613821029663086, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 311.7734375, "epoch": 0.4873046875, "grad_norm": 3.3218342595257826, "kl": 0.064453125, "learning_rate": 8.78173828125e-07, "loss": 0.0026, "reward": 1.7463974952697754, "reward_std": 0.06820238195359707, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7463975548744202, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 298.9765625, "epoch": 0.48779296875, "grad_norm": 2.845066128105519, "kl": 0.045654296875, "learning_rate": 8.780517578125e-07, "loss": 0.0018, "reward": 1.7427108883857727, "reward_std": 0.05628257617354393, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7427108585834503, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 322.265625, "epoch": 0.48828125, "grad_norm": 1.9373093856251078, "kl": 0.0430908203125, "learning_rate": 8.779296875e-07, "loss": 0.0017, "reward": 1.673474371433258, "reward_std": 0.06652860343456268, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6734744310379028, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 292.2109375, "epoch": 0.48876953125, "grad_norm": 1.0767612289001378, "kl": 0.05712890625, "learning_rate": 8.778076171874999e-07, "loss": 0.0023, "reward": 1.7587948441505432, "reward_std": 0.07515900582075119, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7666072845458984, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 345.5625, "epoch": 0.4892578125, "grad_norm": 3.2104609137717213, "kl": 0.0506591796875, "learning_rate": 8.776855468749999e-07, "loss": 0.002, "reward": 1.6220356822013855, "reward_std": 0.17616816610097885, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6376607120037079, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 325.5078125, "epoch": 0.48974609375, "grad_norm": 3.146050014446822, "kl": 0.0667724609375, "learning_rate": 8.775634765625e-07, "loss": 0.0027, "reward": 1.7525697350502014, "reward_std": 0.05434555187821388, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.752569705247879, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 256.6484375, "epoch": 0.490234375, "grad_norm": 1.5821218198575908, "kl": 0.059814453125, "learning_rate": 8.7744140625e-07, "loss": 0.0024, "reward": 1.7353711128234863, "reward_std": 0.10459060035645962, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7353711128234863, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 181.5703125, "epoch": 0.49072265625, "grad_norm": 2.4207497788820755, "kl": 0.076171875, "learning_rate": 8.773193359375e-07, "loss": 0.0031, "reward": 1.7326418161392212, "reward_std": 0.1155674196779728, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7326418459415436, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 298.421875, "epoch": 0.4912109375, "grad_norm": 1.288491196574845, "kl": 0.0489501953125, "learning_rate": 8.77197265625e-07, "loss": 0.002, "reward": 1.7518397569656372, "reward_std": 0.12005745619535446, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7674647867679596, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 294.6015625, "epoch": 0.49169921875, "grad_norm": 2.17017216108325, "kl": 0.05712890625, "learning_rate": 8.770751953124999e-07, "loss": 0.0023, "reward": 1.7059745788574219, "reward_std": 0.09743463061749935, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7137870192527771, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 363.2578125, "epoch": 0.4921875, "grad_norm": 2.2836236753098285, "kl": 0.04638671875, "learning_rate": 8.769531249999999e-07, "loss": 0.0019, "reward": 1.8077695965766907, "reward_std": 0.07913680747151375, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8077695965766907, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 326.1796875, "epoch": 0.49267578125, "grad_norm": 2.175372559341028, "kl": 0.057861328125, "learning_rate": 8.768310546874999e-07, "loss": 0.0023, "reward": 1.5813584327697754, "reward_std": 0.1646919883787632, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.620420902967453, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 277.109375, "epoch": 0.4931640625, "grad_norm": 4.3957866299990584, "kl": 0.0538330078125, "learning_rate": 8.76708984375e-07, "loss": 0.0022, "reward": 1.8353816866874695, "reward_std": 0.08745286241173744, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8353817164897919, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 340.359375, "epoch": 0.49365234375, "grad_norm": 2.6582112897234027, "kl": 0.049560546875, "learning_rate": 8.765869140625e-07, "loss": 0.002, "reward": 1.784598708152771, "reward_std": 0.0864131823182106, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7924111783504486, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 350.03125, "epoch": 0.494140625, "grad_norm": 1.71535782271039, "kl": 0.04443359375, "learning_rate": 8.7646484375e-07, "loss": 0.0018, "reward": 1.7793409824371338, "reward_std": 0.08340132981538773, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7949659824371338, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 241.046875, "epoch": 0.49462890625, "grad_norm": 1.5058958043333488, "kl": 0.0576171875, "learning_rate": 8.763427734375e-07, "loss": 0.0023, "reward": 1.7034955024719238, "reward_std": 0.054896529763936996, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7034954726696014, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 358.375, "epoch": 0.4951171875, "grad_norm": 2.2521914639437264, "kl": 0.041748046875, "learning_rate": 8.762207031249999e-07, "loss": 0.0017, "reward": 1.7619941234588623, "reward_std": 0.05280686542391777, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7619940936565399, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 342.5078125, "epoch": 0.49560546875, "grad_norm": 2.6329502608180997, "kl": 0.046630859375, "learning_rate": 8.760986328124999e-07, "loss": 0.0019, "reward": 1.652110517024994, "reward_std": 0.0947786420583725, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6833605170249939, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 340.859375, "epoch": 0.49609375, "grad_norm": 9.694014679211683, "kl": 0.0516357421875, "learning_rate": 8.759765625e-07, "loss": 0.0021, "reward": 1.6479641199111938, "reward_std": 0.15935315564274788, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6870265901088715, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 360.9921875, "epoch": 0.49658203125, "grad_norm": 3.105492193769874, "kl": 0.0606689453125, "learning_rate": 8.758544921875e-07, "loss": 0.0024, "reward": 1.6694360971450806, "reward_std": 0.15797552838921547, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6928735375404358, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 289.890625, "epoch": 0.4970703125, "grad_norm": 1.937880245962877, "kl": 0.06005859375, "learning_rate": 8.75732421875e-07, "loss": 0.0024, "reward": 1.6215600371360779, "reward_std": 0.17177317291498184, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6293725073337555, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 269.0703125, "epoch": 0.49755859375, "grad_norm": 14.494394764668058, "kl": 0.0552978515625, "learning_rate": 8.756103515625e-07, "loss": 0.0022, "reward": 1.6862713098526, "reward_std": 0.08157765120267868, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6862713694572449, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 271.6171875, "epoch": 0.498046875, "grad_norm": 1.0520254258307151, "kl": 0.0523681640625, "learning_rate": 8.754882812499999e-07, "loss": 0.0021, "reward": 1.700273334980011, "reward_std": 0.10829027369618416, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7158983647823334, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 298.84375, "epoch": 0.49853515625, "grad_norm": 4.450345699769664, "kl": 0.0531005859375, "learning_rate": 8.753662109374999e-07, "loss": 0.0021, "reward": 1.7399001717567444, "reward_std": 0.07643388211727142, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7477126717567444, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 254.078125, "epoch": 0.4990234375, "grad_norm": 1.7817445597680743, "kl": 0.0594482421875, "learning_rate": 8.752441406249999e-07, "loss": 0.0024, "reward": 1.7631664872169495, "reward_std": 0.10064388811588287, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7787915170192719, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 347.2890625, "epoch": 0.49951171875, "grad_norm": 0.7791878426138603, "kl": 0.0509033203125, "learning_rate": 8.751220703125e-07, "loss": 0.002, "reward": 1.8146781921386719, "reward_std": 0.040183124132454395, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8146781921386719, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 288.7734375, "epoch": 0.5, "grad_norm": 7.200748288611341, "kl": 0.0555419921875, "learning_rate": 8.75e-07, "loss": 0.0022, "reward": 1.6464455127716064, "reward_std": 0.11078909412026405, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6620705127716064, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 311.5625, "epoch": 0.50048828125, "grad_norm": 1.3967289593260515, "kl": 0.0596923828125, "learning_rate": 8.748779296875e-07, "loss": 0.0024, "reward": 1.7132092714309692, "reward_std": 0.06034594029188156, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7132093012332916, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 268.6328125, "epoch": 0.5009765625, "grad_norm": 1.143532694929036, "kl": 0.0494384765625, "learning_rate": 8.74755859375e-07, "loss": 0.002, "reward": 1.7486969828605652, "reward_std": 0.08863399224355817, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7643219530582428, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 356.1875, "epoch": 0.50146484375, "grad_norm": 2.946621458474271, "kl": 0.0443115234375, "learning_rate": 8.746337890624999e-07, "loss": 0.0018, "reward": 1.7483346462249756, "reward_std": 0.08586347103118896, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.756147176027298, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 361.0859375, "epoch": 0.501953125, "grad_norm": 3.4713510932561054, "kl": 0.05322265625, "learning_rate": 8.745117187499999e-07, "loss": 0.0021, "reward": 1.6632013320922852, "reward_std": 0.17927244305610657, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6866387724876404, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 397.5546875, "epoch": 0.50244140625, "grad_norm": 1.6093239601152483, "kl": 0.0498046875, "learning_rate": 8.743896484375e-07, "loss": 0.002, "reward": 1.6798649430274963, "reward_std": 0.12823793105781078, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7111149281263351, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 336.953125, "epoch": 0.5029296875, "grad_norm": 2.576957508996577, "kl": 0.063720703125, "learning_rate": 8.74267578125e-07, "loss": 0.0025, "reward": 1.7268919944763184, "reward_std": 0.11807430163025856, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7347044944763184, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 304.0625, "epoch": 0.50341796875, "grad_norm": 1.3370534115868566, "kl": 0.051025390625, "learning_rate": 8.741455078125e-07, "loss": 0.002, "reward": 1.7975549697875977, "reward_std": 0.048098307102918625, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7975549101829529, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 304.7265625, "epoch": 0.50390625, "grad_norm": 1.3836237431764185, "kl": 0.0504150390625, "learning_rate": 8.740234375e-07, "loss": 0.002, "reward": 1.634350836277008, "reward_std": 0.098635109141469, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6421633064746857, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 396.8125, "epoch": 0.50439453125, "grad_norm": 2.5285386551064795, "kl": 0.0504150390625, "learning_rate": 8.739013671874999e-07, "loss": 0.002, "reward": 1.6886343359947205, "reward_std": 0.1606372781097889, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7120718657970428, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 382.265625, "epoch": 0.5048828125, "grad_norm": 3.1872891481166574, "kl": 0.0452880859375, "learning_rate": 8.737792968749999e-07, "loss": 0.0018, "reward": 1.6827728748321533, "reward_std": 0.12052744254469872, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6827729046344757, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 276.5234375, "epoch": 0.50537109375, "grad_norm": 1.4142083245543213, "kl": 0.0538330078125, "learning_rate": 8.736572265624999e-07, "loss": 0.0022, "reward": 1.706916630268097, "reward_std": 0.13945464044809341, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7225416600704193, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 307.6953125, "epoch": 0.505859375, "grad_norm": 1.1813868455912004, "kl": 0.0445556640625, "learning_rate": 8.7353515625e-07, "loss": 0.0018, "reward": 1.8250656127929688, "reward_std": 0.059078922495245934, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8250656723976135, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 241.765625, "epoch": 0.50634765625, "grad_norm": 1.9538556939291407, "kl": 0.0528564453125, "learning_rate": 8.734130859375e-07, "loss": 0.0021, "reward": 1.6294710636138916, "reward_std": 0.12247138097882271, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6294711232185364, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 342.9296875, "epoch": 0.5068359375, "grad_norm": 1.4987364875717888, "kl": 0.042236328125, "learning_rate": 8.73291015625e-07, "loss": 0.0017, "reward": 1.797271490097046, "reward_std": 0.05235948599874973, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7972714900970459, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 335.0, "epoch": 0.50732421875, "grad_norm": 2.833930667181314, "kl": 0.04833984375, "learning_rate": 8.731689453125e-07, "loss": 0.0019, "reward": 1.7178579568862915, "reward_std": 0.09247782826423645, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7256704568862915, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 264.8203125, "epoch": 0.5078125, "grad_norm": 1.467240282257085, "kl": 0.0572509765625, "learning_rate": 8.730468749999999e-07, "loss": 0.0023, "reward": 1.7496626377105713, "reward_std": 0.02964417589828372, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7496626377105713, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 263.703125, "epoch": 0.50830078125, "grad_norm": 1.108726705203492, "kl": 0.052001953125, "learning_rate": 8.729248046874999e-07, "loss": 0.0021, "reward": 1.7599297761917114, "reward_std": 0.04675179207697511, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.759929746389389, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 238.34375, "epoch": 0.5087890625, "grad_norm": 1.5788272962847125, "kl": 0.055908203125, "learning_rate": 8.72802734375e-07, "loss": 0.0022, "reward": 1.8183218836784363, "reward_std": 0.04584968835115433, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.818321943283081, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 311.1953125, "epoch": 0.50927734375, "grad_norm": 4.386222680431177, "kl": 0.074951171875, "learning_rate": 8.726806640625e-07, "loss": 0.003, "reward": 1.7009983658790588, "reward_std": 0.1038425974547863, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7009983360767365, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 330.171875, "epoch": 0.509765625, "grad_norm": 2.3821930491899934, "kl": 0.0462646484375, "learning_rate": 8.7255859375e-07, "loss": 0.0018, "reward": 1.7736052870750427, "reward_std": 0.10071777179837227, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.781417727470398, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 340.5078125, "epoch": 0.51025390625, "grad_norm": 1.549376759535485, "kl": 0.052734375, "learning_rate": 8.724365234375e-07, "loss": 0.0021, "reward": 1.7131693959236145, "reward_std": 0.07255561649799347, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7131693959236145, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 320.6015625, "epoch": 0.5107421875, "grad_norm": 6.712788585227676, "kl": 0.1573486328125, "learning_rate": 8.72314453125e-07, "loss": 0.0063, "reward": 1.6211495399475098, "reward_std": 0.13179854303598404, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6602120995521545, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 308.1171875, "epoch": 0.51123046875, "grad_norm": 3.0574803857364286, "kl": 0.04736328125, "learning_rate": 8.721923828124999e-07, "loss": 0.0019, "reward": 1.7683227062225342, "reward_std": 0.08765990659594536, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7761352360248566, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 257.8046875, "epoch": 0.51171875, "grad_norm": 2.023235694735734, "kl": 0.065185546875, "learning_rate": 8.720703124999999e-07, "loss": 0.0026, "reward": 1.6004237532615662, "reward_std": 0.05043849162757397, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6004238128662109, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 279.609375, "epoch": 0.51220703125, "grad_norm": 1.880850379976763, "kl": 0.058837890625, "learning_rate": 8.719482421875e-07, "loss": 0.0024, "reward": 1.828034520149231, "reward_std": 0.05495187267661095, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.828034520149231, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 290.7421875, "epoch": 0.5126953125, "grad_norm": 1.737728691407687, "kl": 0.05517578125, "learning_rate": 8.71826171875e-07, "loss": 0.0022, "reward": 1.560300588607788, "reward_std": 0.07925301790237427, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5681131184101105, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 270.6328125, "epoch": 0.51318359375, "grad_norm": 2.886774795094905, "kl": 0.0634765625, "learning_rate": 8.717041015625e-07, "loss": 0.0025, "reward": 1.6774318218231201, "reward_std": 0.06801902502775192, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6852443218231201, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 409.0859375, "epoch": 0.513671875, "grad_norm": 1.0122422402580602, "kl": 0.0460205078125, "learning_rate": 8.7158203125e-07, "loss": 0.0018, "reward": 1.7169759273529053, "reward_std": 0.05733257718384266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7169758975505829, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 293.25, "epoch": 0.51416015625, "grad_norm": 1.0407203824158495, "kl": 0.0599365234375, "learning_rate": 8.714599609374999e-07, "loss": 0.0024, "reward": 1.7637624740600586, "reward_std": 0.09369587153196335, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7793874442577362, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 250.25, "epoch": 0.5146484375, "grad_norm": 44.02991363992667, "kl": 0.0645751953125, "learning_rate": 8.713378906249999e-07, "loss": 0.0026, "reward": 1.6559126377105713, "reward_std": 0.057641902938485146, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6559126079082489, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 366.546875, "epoch": 0.51513671875, "grad_norm": 4.093271765716937, "kl": 0.055908203125, "learning_rate": 8.712158203124999e-07, "loss": 0.0022, "reward": 1.6613022685050964, "reward_std": 0.11586426943540573, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6769272983074188, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 345.09375, "epoch": 0.515625, "grad_norm": 3.3041055571935196, "kl": 0.046875, "learning_rate": 8.7109375e-07, "loss": 0.0019, "reward": 1.7097191214561462, "reward_std": 0.09788389131426811, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7253441214561462, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 334.1953125, "epoch": 0.51611328125, "grad_norm": 1.380486793481445, "kl": 0.053466796875, "learning_rate": 8.709716796875e-07, "loss": 0.0021, "reward": 1.7574412822723389, "reward_std": 0.058644311502575874, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7574412822723389, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 289.3515625, "epoch": 0.5166015625, "grad_norm": 5.591919284669544, "kl": 0.060302734375, "learning_rate": 8.70849609375e-07, "loss": 0.0024, "reward": 1.8293917179107666, "reward_std": 0.09176983684301376, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8293918073177338, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 229.9140625, "epoch": 0.51708984375, "grad_norm": 1.2725887032541328, "kl": 0.066650390625, "learning_rate": 8.707275390625e-07, "loss": 0.0027, "reward": 1.706933856010437, "reward_std": 0.07794651389122009, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7069338262081146, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 200.875, "epoch": 0.517578125, "grad_norm": 3.145191103123939, "kl": 0.077392578125, "learning_rate": 8.706054687499999e-07, "loss": 0.0031, "reward": 1.6468673944473267, "reward_std": 0.06843332573771477, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6468673646450043, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 303.234375, "epoch": 0.51806640625, "grad_norm": 2.5048711249337625, "kl": 0.057861328125, "learning_rate": 8.704833984374999e-07, "loss": 0.0023, "reward": 1.7483791708946228, "reward_std": 0.10295334830880165, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7483791410923004, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 228.453125, "epoch": 0.5185546875, "grad_norm": 3.1332272679114657, "kl": 0.0693359375, "learning_rate": 8.70361328125e-07, "loss": 0.0028, "reward": 1.7026050090789795, "reward_std": 0.07863837853074074, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7104175388813019, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 228.25, "epoch": 0.51904296875, "grad_norm": 2.9349932273222374, "kl": 0.0595703125, "learning_rate": 8.702392578125e-07, "loss": 0.0024, "reward": 1.8661960363388062, "reward_std": 0.03855743817985058, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8661959767341614, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 343.734375, "epoch": 0.51953125, "grad_norm": 4.005208537757595, "kl": 0.0533447265625, "learning_rate": 8.701171875e-07, "loss": 0.0021, "reward": 1.6547017097473145, "reward_std": 0.08106643706560135, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6547016203403473, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 277.265625, "epoch": 0.52001953125, "grad_norm": 14.385556488276366, "kl": 0.056396484375, "learning_rate": 8.699951171875e-07, "loss": 0.0023, "reward": 1.7611234784126282, "reward_std": 0.06275673396885395, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7611234188079834, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 242.8203125, "epoch": 0.5205078125, "grad_norm": 2.572037094829945, "kl": 0.055908203125, "learning_rate": 8.698730468749999e-07, "loss": 0.0022, "reward": 1.8109251260757446, "reward_std": 0.0447577740997076, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8109250664710999, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 280.0703125, "epoch": 0.52099609375, "grad_norm": 1.5312642359752389, "kl": 0.054443359375, "learning_rate": 8.697509765624999e-07, "loss": 0.0022, "reward": 1.7536611557006836, "reward_std": 0.06718228757381439, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7536611258983612, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 301.5703125, "epoch": 0.521484375, "grad_norm": 1.3523042468821218, "kl": 0.055419921875, "learning_rate": 8.696289062499999e-07, "loss": 0.0022, "reward": 1.644126534461975, "reward_std": 0.06054982542991638, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6441265642642975, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 271.2734375, "epoch": 0.52197265625, "grad_norm": 1.9861791425441333, "kl": 0.0662841796875, "learning_rate": 8.695068359375e-07, "loss": 0.0027, "reward": 1.6830092668533325, "reward_std": 0.0935671292245388, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6830093264579773, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 355.7890625, "epoch": 0.5224609375, "grad_norm": 7.973684507916818, "kl": 0.0648193359375, "learning_rate": 8.69384765625e-07, "loss": 0.0026, "reward": 1.6281793117523193, "reward_std": 0.15630166232585907, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6359919011592865, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 355.984375, "epoch": 0.52294921875, "grad_norm": 2.766158319030079, "kl": 0.0518798828125, "learning_rate": 8.692626953125e-07, "loss": 0.0021, "reward": 1.7701700925827026, "reward_std": 0.22449339926242828, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.8170450925827026, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 327.3203125, "epoch": 0.5234375, "grad_norm": 2.20414905944699, "kl": 0.0506591796875, "learning_rate": 8.69140625e-07, "loss": 0.002, "reward": 1.783662736415863, "reward_std": 0.07873168960213661, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7914752662181854, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 334.203125, "epoch": 0.52392578125, "grad_norm": 4.143705460739188, "kl": 0.0576171875, "learning_rate": 8.690185546874999e-07, "loss": 0.0023, "reward": 1.6684030294418335, "reward_std": 0.07677320018410683, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6684030294418335, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 350.8203125, "epoch": 0.5244140625, "grad_norm": 2.2380443410195157, "kl": 0.062255859375, "learning_rate": 8.688964843749999e-07, "loss": 0.0025, "reward": 1.8161095976829529, "reward_std": 0.054756371304392815, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8161095678806305, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 292.46875, "epoch": 0.52490234375, "grad_norm": 2.096163231473889, "kl": 0.06787109375, "learning_rate": 8.687744140625e-07, "loss": 0.0027, "reward": 1.8008560538291931, "reward_std": 0.09001643769443035, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8164810538291931, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 367.9375, "epoch": 0.525390625, "grad_norm": 9.937730648389389, "kl": 0.0526123046875, "learning_rate": 8.6865234375e-07, "loss": 0.0021, "reward": 1.7251918315887451, "reward_std": 0.11758009344339371, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7486292719841003, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 298.7421875, "epoch": 0.52587890625, "grad_norm": 9.17932500787084, "kl": 0.0606689453125, "learning_rate": 8.685302734375e-07, "loss": 0.0024, "reward": 1.7291359901428223, "reward_std": 0.07484306022524834, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.729136049747467, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 273.453125, "epoch": 0.5263671875, "grad_norm": 1.9438836015934406, "kl": 0.058349609375, "learning_rate": 8.68408203125e-07, "loss": 0.0023, "reward": 1.6683382987976074, "reward_std": 0.07380038499832153, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6683382987976074, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 274.3984375, "epoch": 0.52685546875, "grad_norm": 5.847630596731743, "kl": 0.07080078125, "learning_rate": 8.682861328124999e-07, "loss": 0.0028, "reward": 1.7424799799919128, "reward_std": 0.04051386937499046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7424799799919128, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 216.6328125, "epoch": 0.52734375, "grad_norm": 4.09131971663552, "kl": 0.060302734375, "learning_rate": 8.681640624999999e-07, "loss": 0.0024, "reward": 1.725938320159912, "reward_std": 0.07212316989898682, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7259383201599121, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 257.328125, "epoch": 0.52783203125, "grad_norm": 3.5388236065556327, "kl": 0.0601806640625, "learning_rate": 8.680419921874999e-07, "loss": 0.0024, "reward": 1.73984694480896, "reward_std": 0.07245020382106304, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7398469150066376, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 337.59375, "epoch": 0.5283203125, "grad_norm": 3.757069955864066, "kl": 0.0584716796875, "learning_rate": 8.67919921875e-07, "loss": 0.0023, "reward": 1.7384961247444153, "reward_std": 0.1039031371474266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7384961247444153, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 359.2421875, "epoch": 0.52880859375, "grad_norm": 6.03160828595789, "kl": 0.0555419921875, "learning_rate": 8.677978515625e-07, "loss": 0.0022, "reward": 1.748001754283905, "reward_std": 0.06500514224171638, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.748001754283905, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 332.5625, "epoch": 0.529296875, "grad_norm": 2.655696243747116, "kl": 0.0491943359375, "learning_rate": 8.6767578125e-07, "loss": 0.002, "reward": 1.8072885274887085, "reward_std": 0.09110748954117298, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8307260870933533, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 326.0078125, "epoch": 0.52978515625, "grad_norm": 3.464007827161993, "kl": 0.054443359375, "learning_rate": 8.675537109375e-07, "loss": 0.0022, "reward": 1.8391188979148865, "reward_std": 0.06963248923420906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8391189575195312, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 301.0, "epoch": 0.5302734375, "grad_norm": 5.126535621984238, "kl": 0.0596923828125, "learning_rate": 8.674316406249999e-07, "loss": 0.0024, "reward": 1.6497421264648438, "reward_std": 0.07221582159399986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6497421860694885, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 301.15625, "epoch": 0.53076171875, "grad_norm": 1.2442388532768656, "kl": 0.055908203125, "learning_rate": 8.673095703124999e-07, "loss": 0.0022, "reward": 1.7321175932884216, "reward_std": 0.10106639470905066, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.739930123090744, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 271.734375, "epoch": 0.53125, "grad_norm": 2.995348861215367, "kl": 0.0584716796875, "learning_rate": 8.671875e-07, "loss": 0.0023, "reward": 1.7458354234695435, "reward_std": 0.037329770624637604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7458354234695435, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 396.6796875, "epoch": 0.53173828125, "grad_norm": 1.167096873944655, "kl": 0.0460205078125, "learning_rate": 8.670654296875e-07, "loss": 0.0018, "reward": 1.7276933789253235, "reward_std": 0.136086568236351, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7589433491230011, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 288.84375, "epoch": 0.5322265625, "grad_norm": 2.968643105816318, "kl": 0.0660400390625, "learning_rate": 8.66943359375e-07, "loss": 0.0026, "reward": 1.7143974304199219, "reward_std": 0.11442429013550282, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7222099602222443, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 376.90625, "epoch": 0.53271484375, "grad_norm": 14.99214457614237, "kl": 0.0484619140625, "learning_rate": 8.668212890625e-07, "loss": 0.0019, "reward": 1.7458195090293884, "reward_std": 0.05967376381158829, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7458195388317108, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 316.953125, "epoch": 0.533203125, "grad_norm": 1.261439455958631, "kl": 0.0469970703125, "learning_rate": 8.666992187499999e-07, "loss": 0.0019, "reward": 1.7515225410461426, "reward_std": 0.05775933898985386, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7515226006507874, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 213.2890625, "epoch": 0.53369140625, "grad_norm": 2.9498487401383953, "kl": 0.068603515625, "learning_rate": 8.665771484374999e-07, "loss": 0.0027, "reward": 1.7203047275543213, "reward_std": 0.08913041837513447, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7203047573566437, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 309.765625, "epoch": 0.5341796875, "grad_norm": 7.1364746418285545, "kl": 0.0576171875, "learning_rate": 8.664550781249999e-07, "loss": 0.0023, "reward": 1.652907907962799, "reward_std": 0.13294285163283348, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6607204079627991, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 301.078125, "epoch": 0.53466796875, "grad_norm": 1.8817117462815578, "kl": 0.0555419921875, "learning_rate": 8.663330078125e-07, "loss": 0.0022, "reward": 1.721267819404602, "reward_std": 0.030888373032212257, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7212677896022797, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 354.53125, "epoch": 0.53515625, "grad_norm": 3.087003895884515, "kl": 0.0443115234375, "learning_rate": 8.662109375e-07, "loss": 0.0018, "reward": 1.7337496876716614, "reward_std": 0.08758010156452656, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7571871876716614, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 203.8515625, "epoch": 0.53564453125, "grad_norm": 1.6271168573011427, "kl": 0.054931640625, "learning_rate": 8.660888671875e-07, "loss": 0.0022, "reward": 1.7751588225364685, "reward_std": 0.03671477176249027, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7751587927341461, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 346.546875, "epoch": 0.5361328125, "grad_norm": 1.7985380156894293, "kl": 0.040283203125, "learning_rate": 8.65966796875e-07, "loss": 0.0016, "reward": 1.6898677945137024, "reward_std": 0.09911376610398293, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6976803243160248, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 317.21875, "epoch": 0.53662109375, "grad_norm": 1.7440656859976609, "kl": 0.050048828125, "learning_rate": 8.658447265624999e-07, "loss": 0.002, "reward": 1.7902414798736572, "reward_std": 0.07932448014616966, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7902414798736572, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 282.546875, "epoch": 0.537109375, "grad_norm": 0.8393827425684771, "kl": 0.0535888671875, "learning_rate": 8.657226562499999e-07, "loss": 0.0021, "reward": 1.746444582939148, "reward_std": 0.04779106751084328, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7464446127414703, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 465.546875, "epoch": 0.53759765625, "grad_norm": 2.5411055906974207, "kl": 0.0362548828125, "learning_rate": 8.656005859375e-07, "loss": 0.0014, "reward": 1.7083318829536438, "reward_std": 0.10737061686813831, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7239568829536438, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 391.9765625, "epoch": 0.5380859375, "grad_norm": 0.8904683187080978, "kl": 0.0406494140625, "learning_rate": 8.65478515625e-07, "loss": 0.0016, "reward": 1.6586171388626099, "reward_std": 0.1240275464951992, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6820546984672546, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 331.546875, "epoch": 0.53857421875, "grad_norm": 2.333691146894965, "kl": 0.043212890625, "learning_rate": 8.653564453125e-07, "loss": 0.0017, "reward": 1.6594500541687012, "reward_std": 0.07179497927427292, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6594500541687012, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 311.2890625, "epoch": 0.5390625, "grad_norm": 2.851343254553436, "kl": 0.0518798828125, "learning_rate": 8.65234375e-07, "loss": 0.0021, "reward": 1.8387314081192017, "reward_std": 0.09944453835487366, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8387314081192017, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 301.1796875, "epoch": 0.53955078125, "grad_norm": 1.9310397948323363, "kl": 0.04345703125, "learning_rate": 8.651123046874999e-07, "loss": 0.0017, "reward": 1.8558620810508728, "reward_std": 0.10813725739717484, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8714870512485504, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 374.3359375, "epoch": 0.5400390625, "grad_norm": 2.7147615717436575, "kl": 0.0428466796875, "learning_rate": 8.649902343749999e-07, "loss": 0.0017, "reward": 1.696751356124878, "reward_std": 0.13834229856729507, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7280014157295227, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 300.7734375, "epoch": 0.54052734375, "grad_norm": 1.3847207971513456, "kl": 0.0501708984375, "learning_rate": 8.648681640624999e-07, "loss": 0.002, "reward": 1.6468215584754944, "reward_std": 0.08769709430634975, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6624464988708496, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 360.703125, "epoch": 0.541015625, "grad_norm": 2.361071789324421, "kl": 0.0533447265625, "learning_rate": 8.6474609375e-07, "loss": 0.0021, "reward": 1.7735760807991028, "reward_std": 0.12644799798727036, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7970135807991028, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 315.15625, "epoch": 0.54150390625, "grad_norm": 2.3227503298020578, "kl": 0.0482177734375, "learning_rate": 8.646240234375e-07, "loss": 0.0019, "reward": 1.6930591464042664, "reward_std": 0.10155784152448177, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7164965569972992, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 214.6875, "epoch": 0.5419921875, "grad_norm": 3.0123794250756006, "kl": 0.0523681640625, "learning_rate": 8.64501953125e-07, "loss": 0.0021, "reward": 1.7508153915405273, "reward_std": 0.04635917954146862, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7508153319358826, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 465.0546875, "epoch": 0.54248046875, "grad_norm": 0.5261264496982065, "kl": 0.037109375, "learning_rate": 8.643798828125e-07, "loss": 0.0015, "reward": 1.5648311376571655, "reward_std": 0.14709511492401361, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6273311078548431, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 403.0625, "epoch": 0.54296875, "grad_norm": 1.5640578960440386, "kl": 0.0469970703125, "learning_rate": 8.642578124999999e-07, "loss": 0.0019, "reward": 1.8022651076316833, "reward_std": 0.04896317049860954, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8022651076316833, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 421.1171875, "epoch": 0.54345703125, "grad_norm": 2.2857686368129295, "kl": 0.042724609375, "learning_rate": 8.641357421874999e-07, "loss": 0.0017, "reward": 1.7114735841751099, "reward_std": 0.17102890089154243, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7505361139774323, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 307.453125, "epoch": 0.5439453125, "grad_norm": 2.4536320336668997, "kl": 0.0411376953125, "learning_rate": 8.64013671875e-07, "loss": 0.0016, "reward": 1.6378782987594604, "reward_std": 0.08712486177682877, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6691283285617828, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 345.21875, "epoch": 0.54443359375, "grad_norm": 1.739157825797545, "kl": 0.0460205078125, "learning_rate": 8.638916015625e-07, "loss": 0.0018, "reward": 1.755341649055481, "reward_std": 0.04750080406665802, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.755341649055481, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 349.7734375, "epoch": 0.544921875, "grad_norm": 2.4358654204024375, "kl": 0.0413818359375, "learning_rate": 8.6376953125e-07, "loss": 0.0017, "reward": 1.7550008893013, "reward_std": 0.04658900573849678, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7550008594989777, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 256.9375, "epoch": 0.54541015625, "grad_norm": 1.9727736242781981, "kl": 0.0411376953125, "learning_rate": 8.636474609375e-07, "loss": 0.0016, "reward": 1.8357142806053162, "reward_std": 0.043268971145153046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8357143700122833, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 338.1484375, "epoch": 0.5458984375, "grad_norm": 1.4259508975204152, "kl": 0.0421142578125, "learning_rate": 8.635253906249999e-07, "loss": 0.0017, "reward": 1.7081193327903748, "reward_std": 0.04622589237987995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7081193625926971, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 271.2109375, "epoch": 0.54638671875, "grad_norm": 21.16092755028298, "kl": 0.0484619140625, "learning_rate": 8.634033203124999e-07, "loss": 0.0019, "reward": 1.7775554656982422, "reward_std": 0.06883841939270496, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7775554060935974, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 223.1328125, "epoch": 0.546875, "grad_norm": 1.8238098647122996, "kl": 0.071044921875, "learning_rate": 8.632812499999999e-07, "loss": 0.0028, "reward": 1.782721757888794, "reward_std": 0.03212573006749153, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.782721757888794, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 293.3125, "epoch": 0.54736328125, "grad_norm": 1.316049654009593, "kl": 0.0457763671875, "learning_rate": 8.631591796875e-07, "loss": 0.0018, "reward": 1.8062964081764221, "reward_std": 0.053434714674949646, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8062964081764221, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 328.375, "epoch": 0.5478515625, "grad_norm": 2.177144782177925, "kl": 0.0462646484375, "learning_rate": 8.63037109375e-07, "loss": 0.0019, "reward": 1.6950576901435852, "reward_std": 0.17912092059850693, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7497451901435852, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 260.453125, "epoch": 0.54833984375, "grad_norm": 1.8973596678524318, "kl": 0.0445556640625, "learning_rate": 8.629150390625e-07, "loss": 0.0018, "reward": 1.6502639651298523, "reward_std": 0.042238444089889526, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6502639055252075, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 366.46875, "epoch": 0.548828125, "grad_norm": 1.620960824121817, "kl": 0.067138671875, "learning_rate": 8.6279296875e-07, "loss": 0.0027, "reward": 1.682121753692627, "reward_std": 0.18632768094539642, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.736809253692627, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 327.09375, "epoch": 0.54931640625, "grad_norm": 1.8774955491831953, "kl": 0.0504150390625, "learning_rate": 8.626708984374999e-07, "loss": 0.002, "reward": 1.661731779575348, "reward_std": 0.06446626409888268, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6617318093776703, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 318.8515625, "epoch": 0.5498046875, "grad_norm": 1.2653808730315304, "kl": 0.0457763671875, "learning_rate": 8.625488281249999e-07, "loss": 0.0018, "reward": 1.7855925559997559, "reward_std": 0.08860567212104797, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7855925559997559, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 208.34375, "epoch": 0.55029296875, "grad_norm": 2.4810925421537715, "kl": 0.05078125, "learning_rate": 8.624267578125e-07, "loss": 0.002, "reward": 1.8083539009094238, "reward_std": 0.07869580388069153, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8083539605140686, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 226.9453125, "epoch": 0.55078125, "grad_norm": 1.170883156742559, "kl": 0.052001953125, "learning_rate": 8.623046875e-07, "loss": 0.0021, "reward": 1.82011079788208, "reward_std": 0.025017164181917906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8201107978820801, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 282.84375, "epoch": 0.55126953125, "grad_norm": 0.7270591612895813, "kl": 0.050048828125, "learning_rate": 8.621826171875e-07, "loss": 0.002, "reward": 1.7170042395591736, "reward_std": 0.0804044771939516, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7248167097568512, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 341.328125, "epoch": 0.5517578125, "grad_norm": 2.0226104624847783, "kl": 0.0504150390625, "learning_rate": 8.62060546875e-07, "loss": 0.002, "reward": 1.6977909207344055, "reward_std": 0.07109425030648708, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6977909505367279, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 286.484375, "epoch": 0.55224609375, "grad_norm": 1.0935418228016645, "kl": 0.046875, "learning_rate": 8.619384765625e-07, "loss": 0.0019, "reward": 1.7301682233810425, "reward_std": 0.0707546304911375, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7379806041717529, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 246.953125, "epoch": 0.552734375, "grad_norm": 2.088569309643272, "kl": 0.0562744140625, "learning_rate": 8.618164062499999e-07, "loss": 0.0022, "reward": 1.7825233340263367, "reward_std": 0.06269277073442936, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7825233638286591, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 306.640625, "epoch": 0.55322265625, "grad_norm": 1.6778708166777785, "kl": 0.0460205078125, "learning_rate": 8.616943359374999e-07, "loss": 0.0018, "reward": 1.6941341757774353, "reward_std": 0.03843311499804258, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6941341161727905, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 323.390625, "epoch": 0.5537109375, "grad_norm": 2.8756770323876784, "kl": 0.0501708984375, "learning_rate": 8.61572265625e-07, "loss": 0.002, "reward": 1.7191390991210938, "reward_std": 0.05277089774608612, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7191390693187714, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 277.8671875, "epoch": 0.55419921875, "grad_norm": 1.7093735784110484, "kl": 0.0504150390625, "learning_rate": 8.614501953125e-07, "loss": 0.002, "reward": 1.875854730606079, "reward_std": 0.03535257466137409, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8758547604084015, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 297.7109375, "epoch": 0.5546875, "grad_norm": 2.1214394895752355, "kl": 0.05419921875, "learning_rate": 8.61328125e-07, "loss": 0.0022, "reward": 1.693172812461853, "reward_std": 0.10498131066560745, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6931727230548859, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 307.921875, "epoch": 0.55517578125, "grad_norm": 2.1311745798315873, "kl": 0.048583984375, "learning_rate": 8.612060546875e-07, "loss": 0.0019, "reward": 1.6516863703727722, "reward_std": 0.06436803564429283, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6516863703727722, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 358.7890625, "epoch": 0.5556640625, "grad_norm": 2.5661871345952236, "kl": 0.0592041015625, "learning_rate": 8.610839843749999e-07, "loss": 0.0024, "reward": 1.5657876133918762, "reward_std": 0.1475287228822708, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5814126431941986, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 372.8984375, "epoch": 0.55615234375, "grad_norm": 1.7584895844002197, "kl": 0.0433349609375, "learning_rate": 8.609619140624999e-07, "loss": 0.0017, "reward": 1.763173222541809, "reward_std": 0.09580406174063683, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7787982225418091, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 328.6953125, "epoch": 0.556640625, "grad_norm": 1.5770899012379078, "kl": 0.0501708984375, "learning_rate": 8.6083984375e-07, "loss": 0.002, "reward": 1.785545527935028, "reward_std": 0.1521715521812439, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8089830279350281, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 342.4375, "epoch": 0.55712890625, "grad_norm": 2.0376418633131888, "kl": 0.0655517578125, "learning_rate": 8.607177734375e-07, "loss": 0.0026, "reward": 1.6794911623001099, "reward_std": 0.19039485603570938, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7029286623001099, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 206.203125, "epoch": 0.5576171875, "grad_norm": 4.0042765130772375, "kl": 0.0548095703125, "learning_rate": 8.60595703125e-07, "loss": 0.0022, "reward": 1.7438197135925293, "reward_std": 0.1175292357802391, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7516322731971741, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 305.703125, "epoch": 0.55810546875, "grad_norm": 1.8931830827167573, "kl": 0.04833984375, "learning_rate": 8.604736328125e-07, "loss": 0.0019, "reward": 1.6692324876785278, "reward_std": 0.10438600182533264, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6770449876785278, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 339.4140625, "epoch": 0.55859375, "grad_norm": 1.5525001439122468, "kl": 0.0489501953125, "learning_rate": 8.603515625e-07, "loss": 0.002, "reward": 1.722908079624176, "reward_std": 0.10220470279455185, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.738533079624176, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 284.34375, "epoch": 0.55908203125, "grad_norm": 1.3304876430870214, "kl": 0.051025390625, "learning_rate": 8.602294921874999e-07, "loss": 0.002, "reward": 1.792259931564331, "reward_std": 0.0483635775744915, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7922599613666534, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 420.015625, "epoch": 0.5595703125, "grad_norm": 3.74826680648072, "kl": 0.04736328125, "learning_rate": 8.601074218749999e-07, "loss": 0.0019, "reward": 1.6866852045059204, "reward_std": 0.0666123665869236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.686685174703598, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 364.296875, "epoch": 0.56005859375, "grad_norm": 1.506824333758443, "kl": 0.0487060546875, "learning_rate": 8.599853515625e-07, "loss": 0.0019, "reward": 1.7012399435043335, "reward_std": 0.11691510677337646, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7246775031089783, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 306.3203125, "epoch": 0.560546875, "grad_norm": 1.229486203330246, "kl": 0.048583984375, "learning_rate": 8.5986328125e-07, "loss": 0.0019, "reward": 1.7387210130691528, "reward_std": 0.1014990508556366, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7465335130691528, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 259.984375, "epoch": 0.56103515625, "grad_norm": 2.293838234033414, "kl": 0.0511474609375, "learning_rate": 8.597412109375e-07, "loss": 0.002, "reward": 1.7903647422790527, "reward_std": 0.07567498832941055, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8138022422790527, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 368.15625, "epoch": 0.5615234375, "grad_norm": 11.895148102191571, "kl": 0.0526123046875, "learning_rate": 8.59619140625e-07, "loss": 0.0021, "reward": 1.7975794076919556, "reward_std": 0.09720364585518837, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7975793480873108, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 353.8828125, "epoch": 0.56201171875, "grad_norm": 2.3961074495667747, "kl": 0.0467529296875, "learning_rate": 8.594970703124999e-07, "loss": 0.0019, "reward": 1.6959925889968872, "reward_std": 0.1357579454779625, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7038050889968872, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 318.46875, "epoch": 0.5625, "grad_norm": 3.689726718036065, "kl": 0.0567626953125, "learning_rate": 8.593749999999999e-07, "loss": 0.0023, "reward": 1.7538256645202637, "reward_std": 0.07644342631101608, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7538256645202637, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 280.9921875, "epoch": 0.56298828125, "grad_norm": 2.075275971718928, "kl": 0.0587158203125, "learning_rate": 8.592529296874999e-07, "loss": 0.0023, "reward": 1.788736641407013, "reward_std": 0.047330291010439396, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7887366712093353, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 375.765625, "epoch": 0.5634765625, "grad_norm": 2.5741019637692033, "kl": 0.0557861328125, "learning_rate": 8.59130859375e-07, "loss": 0.0022, "reward": 1.5938833951950073, "reward_std": 0.11518048122525215, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6251333653926849, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 340.2734375, "epoch": 0.56396484375, "grad_norm": 2.3696482124699134, "kl": 0.048095703125, "learning_rate": 8.590087890625e-07, "loss": 0.0019, "reward": 1.690087914466858, "reward_std": 0.106233149766922, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6979004740715027, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 347.296875, "epoch": 0.564453125, "grad_norm": 2.4027424722018864, "kl": 0.041015625, "learning_rate": 8.5888671875e-07, "loss": 0.0016, "reward": 1.812968671321869, "reward_std": 0.044363994151353836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8129686117172241, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 286.6484375, "epoch": 0.56494140625, "grad_norm": 2.829388054231261, "kl": 0.052001953125, "learning_rate": 8.587646484375e-07, "loss": 0.0021, "reward": 1.6969123482704163, "reward_std": 0.12178021855652332, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7359748482704163, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 225.4375, "epoch": 0.5654296875, "grad_norm": 19.7758891461679, "kl": 0.052734375, "learning_rate": 8.586425781249999e-07, "loss": 0.0021, "reward": 1.7072020769119263, "reward_std": 0.0726108830422163, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.722827136516571, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 340.9296875, "epoch": 0.56591796875, "grad_norm": 8.959080922512666, "kl": 0.05517578125, "learning_rate": 8.585205078124999e-07, "loss": 0.0022, "reward": 1.684401273727417, "reward_std": 0.08838875964283943, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.684401273727417, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 417.0859375, "epoch": 0.56640625, "grad_norm": 2.362357041373118, "kl": 0.039306640625, "learning_rate": 8.583984375e-07, "loss": 0.0016, "reward": 1.6522272229194641, "reward_std": 0.11733454465866089, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6834772229194641, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 280.859375, "epoch": 0.56689453125, "grad_norm": 11.949152567538816, "kl": 0.046630859375, "learning_rate": 8.582763671875e-07, "loss": 0.0019, "reward": 1.7947958111763, "reward_std": 0.08994543924927711, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8026082813739777, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 305.59375, "epoch": 0.5673828125, "grad_norm": 1.9920746576348605, "kl": 0.045166015625, "learning_rate": 8.58154296875e-07, "loss": 0.0018, "reward": 1.666768193244934, "reward_std": 0.09091871604323387, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6745807230472565, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 281.375, "epoch": 0.56787109375, "grad_norm": 2.2336338961392954, "kl": 0.0648193359375, "learning_rate": 8.580322265625e-07, "loss": 0.0026, "reward": 1.5089243650436401, "reward_std": 0.1571076586842537, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5245493352413177, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 322.21875, "epoch": 0.568359375, "grad_norm": 1.6665465636302936, "kl": 0.046142578125, "learning_rate": 8.579101562499999e-07, "loss": 0.0018, "reward": 1.6286611557006836, "reward_std": 0.09474155679345131, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6442860960960388, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 257.5234375, "epoch": 0.56884765625, "grad_norm": 2.6701091262123073, "kl": 0.051025390625, "learning_rate": 8.577880859374999e-07, "loss": 0.002, "reward": 1.7830933332443237, "reward_std": 0.11209750175476074, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.790905773639679, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 366.9921875, "epoch": 0.5693359375, "grad_norm": 1.289301287486059, "kl": 0.03955078125, "learning_rate": 8.576660156249999e-07, "loss": 0.0016, "reward": 1.768127977848053, "reward_std": 0.11993209552019835, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.783752977848053, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 307.171875, "epoch": 0.56982421875, "grad_norm": 1.9298565392126688, "kl": 0.04833984375, "learning_rate": 8.575439453125e-07, "loss": 0.0019, "reward": 1.7926509380340576, "reward_std": 0.08671310544013977, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8004634380340576, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 262.203125, "epoch": 0.5703125, "grad_norm": 2.3663116967309654, "kl": 0.0501708984375, "learning_rate": 8.57421875e-07, "loss": 0.002, "reward": 1.712095022201538, "reward_std": 0.13929815590381622, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7199075222015381, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 322.5078125, "epoch": 0.57080078125, "grad_norm": 1.7141572985706985, "kl": 0.0439453125, "learning_rate": 8.572998046875e-07, "loss": 0.0018, "reward": 1.6946678757667542, "reward_std": 0.0815641526132822, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7024803757667542, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 398.078125, "epoch": 0.5712890625, "grad_norm": 3.523924490731831, "kl": 0.0640869140625, "learning_rate": 8.57177734375e-07, "loss": 0.0026, "reward": 1.6033125519752502, "reward_std": 0.16777217388153076, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6423750221729279, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 306.8359375, "epoch": 0.57177734375, "grad_norm": 1.2742692467315286, "kl": 0.0611572265625, "learning_rate": 8.570556640624999e-07, "loss": 0.0024, "reward": 1.6901500225067139, "reward_std": 0.07790947519242764, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6901499927043915, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 286.875, "epoch": 0.572265625, "grad_norm": 1.1420114365574225, "kl": 0.05029296875, "learning_rate": 8.569335937499999e-07, "loss": 0.002, "reward": 1.7600011825561523, "reward_std": 0.04347742348909378, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7600011825561523, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 300.640625, "epoch": 0.57275390625, "grad_norm": 1.9539279563519465, "kl": 0.052978515625, "learning_rate": 8.568115234375e-07, "loss": 0.0021, "reward": 1.744015395641327, "reward_std": 0.04951014555990696, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7440153956413269, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 329.8359375, "epoch": 0.5732421875, "grad_norm": 2.247065979267746, "kl": 0.0506591796875, "learning_rate": 8.56689453125e-07, "loss": 0.002, "reward": 1.649504840373993, "reward_std": 0.12423533946275711, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6495048403739929, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 346.0, "epoch": 0.57373046875, "grad_norm": 1.111441732375308, "kl": 0.0504150390625, "learning_rate": 8.565673828125e-07, "loss": 0.002, "reward": 1.743337869644165, "reward_std": 0.04013508930802345, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7433378994464874, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 386.9609375, "epoch": 0.57421875, "grad_norm": 2.1368401123680476, "kl": 0.0535888671875, "learning_rate": 8.564453125e-07, "loss": 0.0021, "reward": 1.6390271186828613, "reward_std": 0.16274896264076233, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6624647080898285, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 258.6953125, "epoch": 0.57470703125, "grad_norm": 4.624399047687805, "kl": 0.111328125, "learning_rate": 8.563232421874999e-07, "loss": 0.0045, "reward": 1.5596601963043213, "reward_std": 0.0850033089518547, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5596601665019989, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 314.0546875, "epoch": 0.5751953125, "grad_norm": 2.2574279851451604, "kl": 0.0426025390625, "learning_rate": 8.562011718749999e-07, "loss": 0.0017, "reward": 1.732949137687683, "reward_std": 0.036547823809087276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7329491078853607, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 271.4453125, "epoch": 0.57568359375, "grad_norm": 3.7525805144535487, "kl": 0.0596923828125, "learning_rate": 8.560791015624999e-07, "loss": 0.0024, "reward": 1.653287649154663, "reward_std": 0.11773675680160522, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6532876789569855, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 350.1015625, "epoch": 0.576171875, "grad_norm": 5.2466468575854215, "kl": 0.0731201171875, "learning_rate": 8.5595703125e-07, "loss": 0.0029, "reward": 1.6203824877738953, "reward_std": 0.12023291178047657, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6594450175762177, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 277.7890625, "epoch": 0.57666015625, "grad_norm": 2.0042003028636453, "kl": 0.0506591796875, "learning_rate": 8.558349609375e-07, "loss": 0.002, "reward": 1.6139835119247437, "reward_std": 0.12955578044056892, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.621796041727066, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 364.6328125, "epoch": 0.5771484375, "grad_norm": 1.827852163834602, "kl": 0.0418701171875, "learning_rate": 8.55712890625e-07, "loss": 0.0017, "reward": 1.7956476211547852, "reward_std": 0.07434218749403954, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8034601211547852, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 313.1875, "epoch": 0.57763671875, "grad_norm": 1.4216545847402544, "kl": 0.04736328125, "learning_rate": 8.555908203125e-07, "loss": 0.0019, "reward": 1.7386606931686401, "reward_std": 0.06302103772759438, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7386606633663177, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 281.140625, "epoch": 0.578125, "grad_norm": 1.436370223265228, "kl": 0.077880859375, "learning_rate": 8.554687499999999e-07, "loss": 0.0031, "reward": 1.8124098181724548, "reward_std": 0.04216676577925682, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8124098777770996, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 273.890625, "epoch": 0.57861328125, "grad_norm": 1.0587591031913912, "kl": 0.060546875, "learning_rate": 8.553466796874999e-07, "loss": 0.0024, "reward": 1.7749245166778564, "reward_std": 0.07318684877827764, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7827369868755341, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 264.8125, "epoch": 0.5791015625, "grad_norm": 1.7255970036733406, "kl": 0.0589599609375, "learning_rate": 8.55224609375e-07, "loss": 0.0024, "reward": 1.7037720680236816, "reward_std": 0.05063655413687229, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7115845680236816, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 367.875, "epoch": 0.57958984375, "grad_norm": 2.1334824778608232, "kl": 0.061767578125, "learning_rate": 8.551025390625e-07, "loss": 0.0025, "reward": 1.7053207755088806, "reward_std": 0.20426107943058014, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7209457159042358, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 322.46875, "epoch": 0.580078125, "grad_norm": 24.364302338249043, "kl": 0.05615234375, "learning_rate": 8.5498046875e-07, "loss": 0.0022, "reward": 1.665935754776001, "reward_std": 0.116399385035038, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6737483143806458, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 231.0390625, "epoch": 0.58056640625, "grad_norm": 2.7033766379682644, "kl": 0.0628662109375, "learning_rate": 8.548583984375e-07, "loss": 0.0025, "reward": 1.8331878185272217, "reward_std": 0.05261234473437071, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8331877589225769, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 250.7265625, "epoch": 0.5810546875, "grad_norm": 17.90905176018613, "kl": 0.0579833984375, "learning_rate": 8.547363281249999e-07, "loss": 0.0023, "reward": 1.6796801686286926, "reward_std": 0.09237649664282799, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6874926686286926, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 288.078125, "epoch": 0.58154296875, "grad_norm": 2.1262717212577313, "kl": 0.0618896484375, "learning_rate": 8.546142578124999e-07, "loss": 0.0025, "reward": 1.6501364707946777, "reward_std": 0.0826064795255661, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6501363515853882, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 236.3125, "epoch": 0.58203125, "grad_norm": 1.4136272078115328, "kl": 0.047607421875, "learning_rate": 8.544921874999999e-07, "loss": 0.0019, "reward": 1.8067971467971802, "reward_std": 0.03229185566306114, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8067971765995026, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 295.8046875, "epoch": 0.58251953125, "grad_norm": 2.047535780569528, "kl": 0.0599365234375, "learning_rate": 8.543701171875e-07, "loss": 0.0024, "reward": 1.6728439927101135, "reward_std": 0.1287621632218361, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6884690225124359, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 256.59375, "epoch": 0.5830078125, "grad_norm": 1.557581460070875, "kl": 0.07373046875, "learning_rate": 8.54248046875e-07, "loss": 0.0029, "reward": 1.7042672038078308, "reward_std": 0.13340860605239868, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7198922336101532, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 325.5078125, "epoch": 0.58349609375, "grad_norm": 2.4518824995762207, "kl": 0.05712890625, "learning_rate": 8.541259765625e-07, "loss": 0.0023, "reward": 1.6440874338150024, "reward_std": 0.14590797573328018, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6987749636173248, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 389.21875, "epoch": 0.583984375, "grad_norm": 7.367664763550004, "kl": 0.048095703125, "learning_rate": 8.5400390625e-07, "loss": 0.0019, "reward": 1.708031952381134, "reward_std": 0.13708262518048286, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.723656952381134, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 389.921875, "epoch": 0.58447265625, "grad_norm": 5.15694588705896, "kl": 0.056884765625, "learning_rate": 8.538818359374999e-07, "loss": 0.0023, "reward": 1.557603120803833, "reward_std": 0.18934200704097748, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5888532102108002, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 247.828125, "epoch": 0.5849609375, "grad_norm": 8.59471617688442, "kl": 0.06298828125, "learning_rate": 8.537597656249999e-07, "loss": 0.0025, "reward": 1.7653963565826416, "reward_std": 0.06373865529894829, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7653963565826416, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 424.328125, "epoch": 0.58544921875, "grad_norm": 1.5188060960183658, "kl": 0.0498046875, "learning_rate": 8.536376953125e-07, "loss": 0.002, "reward": 1.5980682969093323, "reward_std": 0.14940915256738663, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6371308267116547, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 244.234375, "epoch": 0.5859375, "grad_norm": 3.9345332601410505, "kl": 0.0758056640625, "learning_rate": 8.53515625e-07, "loss": 0.003, "reward": 1.6427003741264343, "reward_std": 0.07084774971008301, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6427003443241119, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 285.0390625, "epoch": 0.58642578125, "grad_norm": 1.6826818386421434, "kl": 0.07373046875, "learning_rate": 8.533935546875e-07, "loss": 0.003, "reward": 1.664411723613739, "reward_std": 0.16828951984643936, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6878492832183838, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 358.9765625, "epoch": 0.5869140625, "grad_norm": 3.836908468134239, "kl": 0.05517578125, "learning_rate": 8.53271484375e-07, "loss": 0.0022, "reward": 1.614501714706421, "reward_std": 0.20206372626125813, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6535641849040985, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 297.9453125, "epoch": 0.58740234375, "grad_norm": 2.9305798211200287, "kl": 0.0498046875, "learning_rate": 8.531494140624999e-07, "loss": 0.002, "reward": 1.7278985977172852, "reward_std": 0.11820728331804276, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7435235381126404, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 301.671875, "epoch": 0.587890625, "grad_norm": 2.5378734213626157, "kl": 0.056640625, "learning_rate": 8.530273437499999e-07, "loss": 0.0023, "reward": 1.7331331968307495, "reward_std": 0.12400734424591064, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7409456968307495, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 295.3984375, "epoch": 0.58837890625, "grad_norm": 2.6673697152137237, "kl": 0.0645751953125, "learning_rate": 8.529052734374999e-07, "loss": 0.0026, "reward": 1.7612251043319702, "reward_std": 0.07167639397084713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.761225014925003, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 306.84375, "epoch": 0.5888671875, "grad_norm": 1.282713456349074, "kl": 0.06201171875, "learning_rate": 8.52783203125e-07, "loss": 0.0025, "reward": 1.7675436735153198, "reward_std": 0.06219838559627533, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7675436437129974, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 262.609375, "epoch": 0.58935546875, "grad_norm": 1.6085902533879708, "kl": 0.073486328125, "learning_rate": 8.526611328125e-07, "loss": 0.0029, "reward": 1.7036328315734863, "reward_std": 0.07522736862301826, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7114453315734863, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 302.15625, "epoch": 0.58984375, "grad_norm": 1.4976152138206247, "kl": 0.0673828125, "learning_rate": 8.525390625e-07, "loss": 0.0027, "reward": 1.6411904096603394, "reward_std": 0.09779999405145645, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.649002879858017, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 340.6875, "epoch": 0.59033203125, "grad_norm": 1.346538056309748, "kl": 0.0423583984375, "learning_rate": 8.524169921875e-07, "loss": 0.0017, "reward": 1.7001066207885742, "reward_std": 0.1550520807504654, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7235440611839294, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 356.0, "epoch": 0.5908203125, "grad_norm": 1.7030655116642164, "kl": 0.05322265625, "learning_rate": 8.522949218749999e-07, "loss": 0.0021, "reward": 1.6805211901664734, "reward_std": 0.15443892404437065, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7195836007595062, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 317.15625, "epoch": 0.59130859375, "grad_norm": 3.015177679321377, "kl": 0.0623779296875, "learning_rate": 8.521728515624999e-07, "loss": 0.0025, "reward": 1.6988528370857239, "reward_std": 0.1044110469520092, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7144778072834015, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 383.34375, "epoch": 0.591796875, "grad_norm": 1.183423081882842, "kl": 0.06494140625, "learning_rate": 8.5205078125e-07, "loss": 0.0026, "reward": 1.6010417938232422, "reward_std": 0.14411456137895584, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6244792938232422, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 324.4921875, "epoch": 0.59228515625, "grad_norm": 3.816793739656266, "kl": 0.117919921875, "learning_rate": 8.519287109375e-07, "loss": 0.0047, "reward": 1.7807039022445679, "reward_std": 0.11940962262451649, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7885164320468903, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 302.96875, "epoch": 0.5927734375, "grad_norm": 1.2559013257637506, "kl": 0.05029296875, "learning_rate": 8.51806640625e-07, "loss": 0.002, "reward": 1.7532151341438293, "reward_std": 0.09881580621004105, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7688401639461517, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 258.6171875, "epoch": 0.59326171875, "grad_norm": 3.6468537542322106, "kl": 0.072509765625, "learning_rate": 8.516845703125e-07, "loss": 0.0029, "reward": 1.6600202918052673, "reward_std": 0.061623964458703995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6600202918052673, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 304.5859375, "epoch": 0.59375, "grad_norm": 10.981884504101897, "kl": 0.05712890625, "learning_rate": 8.515624999999999e-07, "loss": 0.0023, "reward": 1.701629638671875, "reward_std": 0.0891575813293457, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7016295790672302, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 344.890625, "epoch": 0.59423828125, "grad_norm": 1.4336654046512236, "kl": 0.0540771484375, "learning_rate": 8.514404296874999e-07, "loss": 0.0022, "reward": 1.7428684830665588, "reward_std": 0.08639609813690186, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7506809830665588, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 215.171875, "epoch": 0.5947265625, "grad_norm": 2.776622750108565, "kl": 0.0618896484375, "learning_rate": 8.513183593749999e-07, "loss": 0.0025, "reward": 1.8936978578567505, "reward_std": 0.03348179440945387, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8936978578567505, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 342.7734375, "epoch": 0.59521484375, "grad_norm": 1.8814077810205045, "kl": 0.0614013671875, "learning_rate": 8.511962890625e-07, "loss": 0.0025, "reward": 1.7222504615783691, "reward_std": 0.10825235769152641, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7378754019737244, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 276.734375, "epoch": 0.595703125, "grad_norm": 4.9487866605632975, "kl": 0.050048828125, "learning_rate": 8.5107421875e-07, "loss": 0.002, "reward": 1.6460736989974976, "reward_std": 0.09841511398553848, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6538861691951752, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 298.9765625, "epoch": 0.59619140625, "grad_norm": 3.162686127759236, "kl": 0.065673828125, "learning_rate": 8.509521484375e-07, "loss": 0.0026, "reward": 1.6684702634811401, "reward_std": 0.10185368359088898, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.676282674074173, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 365.0078125, "epoch": 0.5966796875, "grad_norm": 1.9616464372870683, "kl": 0.0635986328125, "learning_rate": 8.50830078125e-07, "loss": 0.0025, "reward": 1.640607476234436, "reward_std": 0.14903107285499573, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6562323570251465, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 297.4921875, "epoch": 0.59716796875, "grad_norm": 4.351483415683568, "kl": 0.056396484375, "learning_rate": 8.507080078124999e-07, "loss": 0.0023, "reward": 1.6847114562988281, "reward_std": 0.060587236657738686, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6847114562988281, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 433.6875, "epoch": 0.59765625, "grad_norm": 1.0863930931712735, "kl": 0.0416259765625, "learning_rate": 8.505859374999999e-07, "loss": 0.0017, "reward": 1.7114101648330688, "reward_std": 0.1884886771440506, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7348476648330688, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 272.0234375, "epoch": 0.59814453125, "grad_norm": 4.182483948402709, "kl": 0.060791015625, "learning_rate": 8.504638671875e-07, "loss": 0.0024, "reward": 1.8124624490737915, "reward_std": 0.06934082508087158, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8124624788761139, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 265.5703125, "epoch": 0.5986328125, "grad_norm": 1.8143814977652135, "kl": 0.05712890625, "learning_rate": 8.50341796875e-07, "loss": 0.0023, "reward": 1.7989888787269592, "reward_std": 0.06971035525202751, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7989888489246368, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 295.265625, "epoch": 0.59912109375, "grad_norm": 1.3565201503680722, "kl": 0.0521240234375, "learning_rate": 8.502197265625e-07, "loss": 0.0021, "reward": 1.723208248615265, "reward_std": 0.07503095269203186, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7310207486152649, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 347.375, "epoch": 0.599609375, "grad_norm": 2.093422094136838, "kl": 0.0616455078125, "learning_rate": 8.5009765625e-07, "loss": 0.0025, "reward": 1.780647873878479, "reward_std": 0.05587127059698105, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7806479036808014, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 284.6328125, "epoch": 0.60009765625, "grad_norm": 0.9638564944473897, "kl": 0.04931640625, "learning_rate": 8.499755859375e-07, "loss": 0.002, "reward": 1.8005688786506653, "reward_std": 0.034580922685563564, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8005689084529877, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 302.4609375, "epoch": 0.6005859375, "grad_norm": 1.7588410237791303, "kl": 0.0498046875, "learning_rate": 8.498535156249999e-07, "loss": 0.002, "reward": 1.7482208609580994, "reward_std": 0.11355694010853767, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7560333609580994, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 230.7109375, "epoch": 0.60107421875, "grad_norm": 5.4986353498353235, "kl": 0.0703125, "learning_rate": 8.497314453124999e-07, "loss": 0.0028, "reward": 1.8072319626808167, "reward_std": 0.09327958524227142, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.822856992483139, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 435.9609375, "epoch": 0.6015625, "grad_norm": 1.5860451056769687, "kl": 0.05078125, "learning_rate": 8.49609375e-07, "loss": 0.002, "reward": 1.631383240222931, "reward_std": 0.1514478251338005, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6548207402229309, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 247.2109375, "epoch": 0.60205078125, "grad_norm": 5.160310522295723, "kl": 0.064697265625, "learning_rate": 8.494873046875e-07, "loss": 0.0026, "reward": 1.824979543685913, "reward_std": 0.06196466274559498, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8249796032905579, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 242.578125, "epoch": 0.6025390625, "grad_norm": 3.0896098116019375, "kl": 0.068603515625, "learning_rate": 8.49365234375e-07, "loss": 0.0027, "reward": 1.6698785424232483, "reward_std": 0.17456145584583282, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6776910722255707, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 263.078125, "epoch": 0.60302734375, "grad_norm": 2.3461866135262275, "kl": 0.0556640625, "learning_rate": 8.492431640625e-07, "loss": 0.0022, "reward": 1.726996123790741, "reward_std": 0.07286924868822098, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7269961833953857, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 338.4765625, "epoch": 0.603515625, "grad_norm": 3.2008131777902302, "kl": 0.076171875, "learning_rate": 8.491210937499999e-07, "loss": 0.0031, "reward": 1.5925570726394653, "reward_std": 0.13758273422718048, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6159945428371429, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 328.09375, "epoch": 0.60400390625, "grad_norm": 1.0869290753732679, "kl": 0.0523681640625, "learning_rate": 8.489990234374999e-07, "loss": 0.0021, "reward": 1.6631884574890137, "reward_std": 0.1325419619679451, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6944384574890137, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 260.390625, "epoch": 0.6044921875, "grad_norm": 1.043664689306585, "kl": 0.0596923828125, "learning_rate": 8.48876953125e-07, "loss": 0.0024, "reward": 1.7231544256210327, "reward_std": 0.029385648667812347, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7231544256210327, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 261.046875, "epoch": 0.60498046875, "grad_norm": 1.9882566923730944, "kl": 0.0679931640625, "learning_rate": 8.487548828125e-07, "loss": 0.0027, "reward": 1.755352258682251, "reward_std": 0.050556398928165436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.755352258682251, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 291.1875, "epoch": 0.60546875, "grad_norm": 0.9718933967102993, "kl": 0.0498046875, "learning_rate": 8.486328125e-07, "loss": 0.002, "reward": 1.7807682752609253, "reward_std": 0.05785749014467001, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7807681560516357, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 386.15625, "epoch": 0.60595703125, "grad_norm": 1.0864992081357003, "kl": 0.0439453125, "learning_rate": 8.485107421875e-07, "loss": 0.0018, "reward": 1.8006258606910706, "reward_std": 0.10033701360225677, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8162508606910706, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 279.28125, "epoch": 0.6064453125, "grad_norm": 0.72379267103834, "kl": 0.0555419921875, "learning_rate": 8.48388671875e-07, "loss": 0.0022, "reward": 1.7179449796676636, "reward_std": 0.10580763639882207, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7491949796676636, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 297.1953125, "epoch": 0.60693359375, "grad_norm": 6.134340243573936, "kl": 0.056640625, "learning_rate": 8.482666015624999e-07, "loss": 0.0023, "reward": 1.6223798394203186, "reward_std": 0.08309117332100868, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6223797798156738, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 261.03125, "epoch": 0.607421875, "grad_norm": 2.981575045817795, "kl": 0.06689453125, "learning_rate": 8.481445312499999e-07, "loss": 0.0027, "reward": 1.5559495091438293, "reward_std": 0.12914244830608368, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.571574479341507, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 331.5703125, "epoch": 0.60791015625, "grad_norm": 1.167173976520968, "kl": 0.0479736328125, "learning_rate": 8.480224609375e-07, "loss": 0.0019, "reward": 1.8290737867355347, "reward_std": 0.09445438906550407, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8446987867355347, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 380.4921875, "epoch": 0.6083984375, "grad_norm": 2.3473086338411733, "kl": 0.041259765625, "learning_rate": 8.47900390625e-07, "loss": 0.0017, "reward": 1.7552416920661926, "reward_std": 0.0686273779720068, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7552417516708374, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 433.984375, "epoch": 0.60888671875, "grad_norm": 1.989690821238528, "kl": 0.0418701171875, "learning_rate": 8.477783203125e-07, "loss": 0.0017, "reward": 1.7533529996871948, "reward_std": 0.08260135725140572, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7533529996871948, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 392.46875, "epoch": 0.609375, "grad_norm": 4.174383689747545, "kl": 0.0361328125, "learning_rate": 8.4765625e-07, "loss": 0.0014, "reward": 1.7376724481582642, "reward_std": 0.12889265269041061, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7532974779605865, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 316.9375, "epoch": 0.60986328125, "grad_norm": 1.41606671516539, "kl": 0.0467529296875, "learning_rate": 8.475341796874999e-07, "loss": 0.0019, "reward": 1.7911608219146729, "reward_std": 0.05567508563399315, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7911607921123505, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 277.8046875, "epoch": 0.6103515625, "grad_norm": 1.3841122351592616, "kl": 0.0531005859375, "learning_rate": 8.474121093749999e-07, "loss": 0.0021, "reward": 1.8352203965187073, "reward_std": 0.06681127846240997, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8352203369140625, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 255.4140625, "epoch": 0.61083984375, "grad_norm": 1.6245690807772388, "kl": 0.054443359375, "learning_rate": 8.472900390624999e-07, "loss": 0.0022, "reward": 1.83678537607193, "reward_std": 0.0678851343691349, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8367853164672852, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 317.6015625, "epoch": 0.611328125, "grad_norm": 1.6847918430394793, "kl": 0.0633544921875, "learning_rate": 8.4716796875e-07, "loss": 0.0025, "reward": 1.6871824860572815, "reward_std": 0.12895482033491135, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6949949860572815, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 270.8671875, "epoch": 0.61181640625, "grad_norm": 1.9808612056878117, "kl": 0.047119140625, "learning_rate": 8.470458984375e-07, "loss": 0.0019, "reward": 1.8585364818572998, "reward_std": 0.05595720373094082, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8585363328456879, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 257.1953125, "epoch": 0.6123046875, "grad_norm": 1.023636015557283, "kl": 0.0531005859375, "learning_rate": 8.46923828125e-07, "loss": 0.0021, "reward": 1.7553237080574036, "reward_std": 0.017358798999339342, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7553236782550812, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 272.390625, "epoch": 0.61279296875, "grad_norm": 2.129666626613869, "kl": 0.0482177734375, "learning_rate": 8.468017578125e-07, "loss": 0.0019, "reward": 1.830765187740326, "reward_std": 0.06046690791845322, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8307652175426483, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 270.875, "epoch": 0.61328125, "grad_norm": 5.092569077503816, "kl": 0.072021484375, "learning_rate": 8.466796874999999e-07, "loss": 0.0029, "reward": 1.7521468997001648, "reward_std": 0.11612342670559883, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.75995934009552, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 236.5859375, "epoch": 0.61376953125, "grad_norm": 2.542365050667952, "kl": 0.06787109375, "learning_rate": 8.465576171874999e-07, "loss": 0.0027, "reward": 1.8093348741531372, "reward_std": 0.05134081654250622, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8093349635601044, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 228.34375, "epoch": 0.6142578125, "grad_norm": 2.0593288502085576, "kl": 0.0782470703125, "learning_rate": 8.46435546875e-07, "loss": 0.0031, "reward": 1.7897635102272034, "reward_std": 0.08628207445144653, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7975760698318481, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 285.0390625, "epoch": 0.61474609375, "grad_norm": 1.1846536082668164, "kl": 0.0538330078125, "learning_rate": 8.463134765625e-07, "loss": 0.0022, "reward": 1.6509920954704285, "reward_std": 0.08322879299521446, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6588045656681061, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 340.6796875, "epoch": 0.615234375, "grad_norm": 9.144569025352508, "kl": 0.0601806640625, "learning_rate": 8.4619140625e-07, "loss": 0.0024, "reward": 1.7124608755111694, "reward_std": 0.06501621380448341, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7124608755111694, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 268.5546875, "epoch": 0.61572265625, "grad_norm": 10.55754939249481, "kl": 0.060302734375, "learning_rate": 8.460693359375e-07, "loss": 0.0024, "reward": 1.7932811379432678, "reward_std": 0.07128015346825123, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7932811081409454, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 191.984375, "epoch": 0.6162109375, "grad_norm": 71.34719881454178, "kl": 0.06298828125, "learning_rate": 8.459472656249999e-07, "loss": 0.0025, "reward": 1.7777682542800903, "reward_std": 0.025185417383909225, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7777682244777679, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 273.7578125, "epoch": 0.61669921875, "grad_norm": 1.978131656908925, "kl": 0.061767578125, "learning_rate": 8.458251953124999e-07, "loss": 0.0025, "reward": 1.6713601350784302, "reward_std": 0.12586339935660362, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6869851052761078, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 266.6796875, "epoch": 0.6171875, "grad_norm": 1.5988305057821919, "kl": 0.0751953125, "learning_rate": 8.457031249999999e-07, "loss": 0.003, "reward": 1.75057852268219, "reward_std": 0.0573820099234581, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7505785524845123, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 295.5546875, "epoch": 0.61767578125, "grad_norm": 1.4597428566631678, "kl": 0.0667724609375, "learning_rate": 8.455810546875e-07, "loss": 0.0027, "reward": 1.759526014328003, "reward_std": 0.08681388199329376, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7673385143280029, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 228.53125, "epoch": 0.6181640625, "grad_norm": 3.422813642948489, "kl": 0.08837890625, "learning_rate": 8.45458984375e-07, "loss": 0.0035, "reward": 1.8277055025100708, "reward_std": 0.08797085843980312, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8355179727077484, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 286.109375, "epoch": 0.61865234375, "grad_norm": 1.3015209293279653, "kl": 0.0609130859375, "learning_rate": 8.453369140625e-07, "loss": 0.0024, "reward": 1.7601238489151, "reward_std": 0.03359607141464949, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7601238191127777, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 223.8828125, "epoch": 0.619140625, "grad_norm": 3.1082145060006856, "kl": 0.0732421875, "learning_rate": 8.4521484375e-07, "loss": 0.0029, "reward": 1.5394993424415588, "reward_std": 0.0939161665737629, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5394993126392365, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 254.5078125, "epoch": 0.61962890625, "grad_norm": 1.281242601050256, "kl": 0.0672607421875, "learning_rate": 8.450927734374999e-07, "loss": 0.0027, "reward": 1.7604122757911682, "reward_std": 0.08974438905715942, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7604122757911682, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 342.515625, "epoch": 0.6201171875, "grad_norm": 3.739110086504897, "kl": 0.05712890625, "learning_rate": 8.449707031249999e-07, "loss": 0.0023, "reward": 1.8326544761657715, "reward_std": 0.05452083423733711, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8326544463634491, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 248.171875, "epoch": 0.62060546875, "grad_norm": 2.0306719948385865, "kl": 0.06689453125, "learning_rate": 8.448486328125e-07, "loss": 0.0027, "reward": 1.6453559398651123, "reward_std": 0.06446324661374092, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6453558802604675, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 259.484375, "epoch": 0.62109375, "grad_norm": 2.743793220567594, "kl": 0.0662841796875, "learning_rate": 8.447265625e-07, "loss": 0.0026, "reward": 1.7001383304595947, "reward_std": 0.09162449836730957, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7001383602619171, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 245.734375, "epoch": 0.62158203125, "grad_norm": 2.553544875125356, "kl": 0.0584716796875, "learning_rate": 8.446044921875e-07, "loss": 0.0023, "reward": 1.5885206460952759, "reward_std": 0.047395724803209305, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5885206460952759, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 193.375, "epoch": 0.6220703125, "grad_norm": 0.8661435940898954, "kl": 0.057373046875, "learning_rate": 8.44482421875e-07, "loss": 0.0023, "reward": 1.8218601942062378, "reward_std": 0.047537509351968765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.821860134601593, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 244.9140625, "epoch": 0.62255859375, "grad_norm": 1.8229102343241281, "kl": 0.0611572265625, "learning_rate": 8.443603515624999e-07, "loss": 0.0024, "reward": 1.7554479241371155, "reward_std": 0.06095794588327408, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7554478943347931, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 234.875, "epoch": 0.623046875, "grad_norm": 5.054715442884952, "kl": 0.056640625, "learning_rate": 8.442382812499999e-07, "loss": 0.0023, "reward": 1.698002815246582, "reward_std": 0.1263159103691578, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.705815315246582, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 279.109375, "epoch": 0.62353515625, "grad_norm": 1.3577851009475093, "kl": 0.05859375, "learning_rate": 8.441162109374999e-07, "loss": 0.0023, "reward": 1.7819681763648987, "reward_std": 0.05448628589510918, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7819681465625763, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 323.9609375, "epoch": 0.6240234375, "grad_norm": 1.9557945914662318, "kl": 0.0556640625, "learning_rate": 8.43994140625e-07, "loss": 0.0022, "reward": 1.5706565976142883, "reward_std": 0.10779277980327606, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5784690678119659, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 191.9609375, "epoch": 0.62451171875, "grad_norm": 9.30380292114862, "kl": 0.053955078125, "learning_rate": 8.438720703125e-07, "loss": 0.0022, "reward": 1.780591070652008, "reward_std": 0.047663201577961445, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7805911004543304, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 314.7109375, "epoch": 0.625, "grad_norm": 1.1932154787420253, "kl": 0.0540771484375, "learning_rate": 8.4375e-07, "loss": 0.0022, "reward": 1.7884827852249146, "reward_std": 0.07211063336580992, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7962952554225922, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 289.7265625, "epoch": 0.62548828125, "grad_norm": 1.9144731486079674, "kl": 0.03955078125, "learning_rate": 8.436279296875e-07, "loss": 0.0016, "reward": 1.8027944564819336, "reward_std": 0.04255840554833412, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.802794486284256, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 322.203125, "epoch": 0.6259765625, "grad_norm": 2.772928558355846, "kl": 0.0596923828125, "learning_rate": 8.435058593749999e-07, "loss": 0.0024, "reward": 1.731988787651062, "reward_std": 0.09315790981054306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7319887578487396, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 380.1953125, "epoch": 0.62646484375, "grad_norm": 9.787372043948942, "kl": 0.0592041015625, "learning_rate": 8.433837890624999e-07, "loss": 0.0024, "reward": 1.7648069858551025, "reward_std": 0.03516199626028538, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7648070156574249, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 394.4765625, "epoch": 0.626953125, "grad_norm": 3.356579451338363, "kl": 0.05126953125, "learning_rate": 8.4326171875e-07, "loss": 0.0021, "reward": 1.696887195110321, "reward_std": 0.05591726675629616, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6968871355056763, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 346.6015625, "epoch": 0.62744140625, "grad_norm": 2.549785582844497, "kl": 0.0439453125, "learning_rate": 8.431396484375e-07, "loss": 0.0018, "reward": 1.7408929467201233, "reward_std": 0.11647412180900574, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7487053871154785, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 354.4609375, "epoch": 0.6279296875, "grad_norm": 0.9021938993838211, "kl": 0.0540771484375, "learning_rate": 8.43017578125e-07, "loss": 0.0022, "reward": 1.758631408214569, "reward_std": 0.126407902687788, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7820688784122467, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 344.09375, "epoch": 0.62841796875, "grad_norm": 1.3267018883224448, "kl": 0.0491943359375, "learning_rate": 8.428955078125e-07, "loss": 0.002, "reward": 1.632334589958191, "reward_std": 0.0630562799051404, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6401470899581909, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 296.5625, "epoch": 0.62890625, "grad_norm": 2.2552351150707017, "kl": 0.048828125, "learning_rate": 8.427734374999999e-07, "loss": 0.002, "reward": 1.8420506715774536, "reward_std": 0.1419878453016281, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8498630821704865, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 293.1171875, "epoch": 0.62939453125, "grad_norm": 3.9365564848072605, "kl": 0.0616455078125, "learning_rate": 8.426513671874999e-07, "loss": 0.0025, "reward": 1.6480534076690674, "reward_std": 0.04998471587896347, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6480533927679062, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 238.265625, "epoch": 0.6298828125, "grad_norm": 11.182204805928025, "kl": 0.071044921875, "learning_rate": 8.425292968749999e-07, "loss": 0.0028, "reward": 1.7481674551963806, "reward_std": 0.10754155367612839, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7481675148010254, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 283.03125, "epoch": 0.63037109375, "grad_norm": 9.473971973141426, "kl": 0.0615234375, "learning_rate": 8.424072265625e-07, "loss": 0.0025, "reward": 1.7282034158706665, "reward_std": 0.08554265275597572, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7282033860683441, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 303.9609375, "epoch": 0.630859375, "grad_norm": 1.5297115240623664, "kl": 0.054931640625, "learning_rate": 8.4228515625e-07, "loss": 0.0022, "reward": 1.7187672853469849, "reward_std": 0.06834917794913054, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7265797853469849, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 323.2890625, "epoch": 0.63134765625, "grad_norm": 1.0070423771790409, "kl": 0.048095703125, "learning_rate": 8.421630859375e-07, "loss": 0.0019, "reward": 1.7618906497955322, "reward_std": 0.0947088971734047, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7697031795978546, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 361.09375, "epoch": 0.6318359375, "grad_norm": 2.568966029071052, "kl": 0.056396484375, "learning_rate": 8.42041015625e-07, "loss": 0.0023, "reward": 1.6242307424545288, "reward_std": 0.21024633944034576, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.663293182849884, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 349.21875, "epoch": 0.63232421875, "grad_norm": 1.9848309178933314, "kl": 0.049560546875, "learning_rate": 8.419189453124999e-07, "loss": 0.002, "reward": 1.7481633424758911, "reward_std": 0.12243251502513885, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7716008424758911, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 303.59375, "epoch": 0.6328125, "grad_norm": 0.8761196952724171, "kl": 0.0399169921875, "learning_rate": 8.417968749999999e-07, "loss": 0.0016, "reward": 1.7893099188804626, "reward_std": 0.05286476016044617, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7893097996711731, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 265.4296875, "epoch": 0.63330078125, "grad_norm": 1.1095061436724811, "kl": 0.0584716796875, "learning_rate": 8.416748046875e-07, "loss": 0.0023, "reward": 1.7433611750602722, "reward_std": 0.061368606984615326, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7433610558509827, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 312.3125, "epoch": 0.6337890625, "grad_norm": 2.146602454130791, "kl": 0.055419921875, "learning_rate": 8.41552734375e-07, "loss": 0.0022, "reward": 1.7003250122070312, "reward_std": 0.08811355289071798, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7081375122070312, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 339.0859375, "epoch": 0.63427734375, "grad_norm": 1.165844265780615, "kl": 0.05322265625, "learning_rate": 8.414306640625e-07, "loss": 0.0021, "reward": 1.6436746716499329, "reward_std": 0.0825019795447588, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6436747312545776, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 328.765625, "epoch": 0.634765625, "grad_norm": 2.5464365422498356, "kl": 0.0504150390625, "learning_rate": 8.4130859375e-07, "loss": 0.002, "reward": 1.7102563381195068, "reward_std": 0.0774708678945899, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7180688977241516, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 202.3984375, "epoch": 0.63525390625, "grad_norm": 2.7704272264732666, "kl": 0.0579833984375, "learning_rate": 8.411865234374999e-07, "loss": 0.0023, "reward": 1.7323628664016724, "reward_std": 0.08244866505265236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7323628962039948, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 309.9296875, "epoch": 0.6357421875, "grad_norm": 1.5304947901397634, "kl": 0.0517578125, "learning_rate": 8.410644531249999e-07, "loss": 0.0021, "reward": 1.77633798122406, "reward_std": 0.09737828373908997, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7841504514217377, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 250.390625, "epoch": 0.63623046875, "grad_norm": 2.9720279627250954, "kl": 0.0599365234375, "learning_rate": 8.409423828124999e-07, "loss": 0.0024, "reward": 1.7398386597633362, "reward_std": 0.06009085476398468, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7398386597633362, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 330.8359375, "epoch": 0.63671875, "grad_norm": 1.4673255697862866, "kl": 0.06982421875, "learning_rate": 8.408203125e-07, "loss": 0.0028, "reward": 1.6804990768432617, "reward_std": 0.05310311168432236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6804989874362946, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 295.875, "epoch": 0.63720703125, "grad_norm": 1.4253383817755978, "kl": 0.057373046875, "learning_rate": 8.406982421875e-07, "loss": 0.0023, "reward": 1.830526053905487, "reward_std": 0.05299729108810425, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8305260539054871, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 396.9140625, "epoch": 0.6376953125, "grad_norm": 2.353756599931053, "kl": 0.047607421875, "learning_rate": 8.40576171875e-07, "loss": 0.0019, "reward": 1.719884991645813, "reward_std": 0.11661730334162712, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.727697491645813, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 358.5703125, "epoch": 0.63818359375, "grad_norm": 1.3478820253260286, "kl": 0.0499267578125, "learning_rate": 8.404541015625e-07, "loss": 0.002, "reward": 1.6699483394622803, "reward_std": 0.1071729026734829, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6777608394622803, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 219.5546875, "epoch": 0.638671875, "grad_norm": 1.8256350632514393, "kl": 0.06689453125, "learning_rate": 8.403320312499999e-07, "loss": 0.0027, "reward": 1.8012477159500122, "reward_std": 0.0809515118598938, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012477159500122, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 275.3359375, "epoch": 0.63916015625, "grad_norm": 4.051138050422444, "kl": 0.0504150390625, "learning_rate": 8.402099609374999e-07, "loss": 0.002, "reward": 1.8088411688804626, "reward_std": 0.07655365020036697, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8088411688804626, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 278.5703125, "epoch": 0.6396484375, "grad_norm": 5.526840892766896, "kl": 0.05615234375, "learning_rate": 8.40087890625e-07, "loss": 0.0022, "reward": 1.7295674085617065, "reward_std": 0.06708750128746033, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7295673787593842, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 336.0078125, "epoch": 0.64013671875, "grad_norm": 1.608420576473483, "kl": 0.050537109375, "learning_rate": 8.399658203125e-07, "loss": 0.002, "reward": 1.7628620862960815, "reward_std": 0.0696718655526638, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7706745862960815, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 267.0, "epoch": 0.640625, "grad_norm": 1.5968411371770552, "kl": 0.057373046875, "learning_rate": 8.3984375e-07, "loss": 0.0023, "reward": 1.652459740638733, "reward_std": 0.06461456045508385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6524598002433777, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 315.1875, "epoch": 0.64111328125, "grad_norm": 1.3200737326998475, "kl": 0.054443359375, "learning_rate": 8.397216796875e-07, "loss": 0.0022, "reward": 1.6885486841201782, "reward_std": 0.08856038376688957, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6963611841201782, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 255.09375, "epoch": 0.6416015625, "grad_norm": 3.6728233406933257, "kl": 0.068603515625, "learning_rate": 8.395996093749999e-07, "loss": 0.0027, "reward": 1.6466514468193054, "reward_std": 0.0514130312949419, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6544639468193054, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 295.0390625, "epoch": 0.64208984375, "grad_norm": 0.8191385325359082, "kl": 0.0511474609375, "learning_rate": 8.394775390624999e-07, "loss": 0.002, "reward": 1.7703859210014343, "reward_std": 0.0926944687962532, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7860109210014343, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 302.75, "epoch": 0.642578125, "grad_norm": 3.0861231360855617, "kl": 0.0599365234375, "learning_rate": 8.393554687499999e-07, "loss": 0.0024, "reward": 1.6689003109931946, "reward_std": 0.07885997742414474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6689003109931946, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 296.296875, "epoch": 0.64306640625, "grad_norm": 1.8166865063096989, "kl": 0.0635986328125, "learning_rate": 8.392333984375e-07, "loss": 0.0025, "reward": 1.6987890005111694, "reward_std": 0.06034014839679003, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6987889111042023, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 349.046875, "epoch": 0.6435546875, "grad_norm": 1.4887931508827008, "kl": 0.0628662109375, "learning_rate": 8.39111328125e-07, "loss": 0.0025, "reward": 1.7936866283416748, "reward_std": 0.04510762542486191, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7936865985393524, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 326.3671875, "epoch": 0.64404296875, "grad_norm": 1.2967345838531843, "kl": 0.050537109375, "learning_rate": 8.389892578125e-07, "loss": 0.002, "reward": 1.83639657497406, "reward_std": 0.06598273664712906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8363966047763824, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 253.5625, "epoch": 0.64453125, "grad_norm": 1.8104292750631013, "kl": 0.067626953125, "learning_rate": 8.388671875e-07, "loss": 0.0027, "reward": 1.7048075199127197, "reward_std": 0.05657285824418068, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.704807460308075, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 333.359375, "epoch": 0.64501953125, "grad_norm": 2.7804914867197734, "kl": 0.0479736328125, "learning_rate": 8.387451171874999e-07, "loss": 0.0019, "reward": 1.791284441947937, "reward_std": 0.04943067207932472, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.791284441947937, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 392.2578125, "epoch": 0.6455078125, "grad_norm": 1.5521266813840282, "kl": 0.0557861328125, "learning_rate": 8.386230468749999e-07, "loss": 0.0022, "reward": 1.6612219214439392, "reward_std": 0.06563910469412804, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6612218618392944, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 268.65625, "epoch": 0.64599609375, "grad_norm": 1.6124206023884073, "kl": 0.056396484375, "learning_rate": 8.385009765625e-07, "loss": 0.0023, "reward": 1.753632366657257, "reward_std": 0.051005132496356964, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7536323666572571, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 302.9765625, "epoch": 0.646484375, "grad_norm": 2.247706598191221, "kl": 0.054443359375, "learning_rate": 8.3837890625e-07, "loss": 0.0022, "reward": 1.7354570031166077, "reward_std": 0.16325188055634499, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7510820627212524, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 255.796875, "epoch": 0.64697265625, "grad_norm": 11.389486659767021, "kl": 0.0579833984375, "learning_rate": 8.382568359375e-07, "loss": 0.0023, "reward": 1.7502111196517944, "reward_std": 0.0659194141626358, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7502111196517944, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 305.5390625, "epoch": 0.6474609375, "grad_norm": 1.2834360713798052, "kl": 0.0615234375, "learning_rate": 8.38134765625e-07, "loss": 0.0025, "reward": 1.69877290725708, "reward_std": 0.052922509610652924, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6987729072570801, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 315.453125, "epoch": 0.64794921875, "grad_norm": 1.2944757450752098, "kl": 0.058837890625, "learning_rate": 8.380126953125e-07, "loss": 0.0024, "reward": 1.7563305497169495, "reward_std": 0.08665376901626587, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7563305497169495, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 291.6796875, "epoch": 0.6484375, "grad_norm": 6.608818467370714, "kl": 0.0654296875, "learning_rate": 8.378906249999999e-07, "loss": 0.0026, "reward": 1.6104682683944702, "reward_std": 0.08941986411809921, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6104682385921478, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 261.015625, "epoch": 0.64892578125, "grad_norm": 1.7084744934823917, "kl": 0.06494140625, "learning_rate": 8.377685546874999e-07, "loss": 0.0026, "reward": 1.7628703117370605, "reward_std": 0.044559099711477757, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7628703117370605, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 290.265625, "epoch": 0.6494140625, "grad_norm": 1.0774403862289603, "kl": 0.054443359375, "learning_rate": 8.37646484375e-07, "loss": 0.0022, "reward": 1.8073540329933167, "reward_std": 0.07767094019800425, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8151665329933167, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 281.7421875, "epoch": 0.64990234375, "grad_norm": 1.8398660445850628, "kl": 0.0673828125, "learning_rate": 8.375244140625e-07, "loss": 0.0027, "reward": 1.6594313383102417, "reward_std": 0.10600551217794418, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6672438383102417, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 220.0859375, "epoch": 0.650390625, "grad_norm": 3.934324761283697, "kl": 0.08056640625, "learning_rate": 8.3740234375e-07, "loss": 0.0032, "reward": 1.6837170720100403, "reward_std": 0.029794931411743164, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6837171018123627, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 308.984375, "epoch": 0.65087890625, "grad_norm": 13.797314264854801, "kl": 0.062744140625, "learning_rate": 8.372802734375e-07, "loss": 0.0025, "reward": 1.8004740476608276, "reward_std": 0.053687095642089844, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8004740178585052, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 236.578125, "epoch": 0.6513671875, "grad_norm": 1.1330439181277785, "kl": 0.04736328125, "learning_rate": 8.371582031249999e-07, "loss": 0.0019, "reward": 1.6666799783706665, "reward_std": 0.10937470942735672, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6744924187660217, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 287.859375, "epoch": 0.65185546875, "grad_norm": 1.4553589678453058, "kl": 0.0582275390625, "learning_rate": 8.370361328124999e-07, "loss": 0.0023, "reward": 1.6906811594963074, "reward_std": 0.04443385824561119, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6906810998916626, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 333.890625, "epoch": 0.65234375, "grad_norm": 1.302159303083921, "kl": 0.048095703125, "learning_rate": 8.369140625e-07, "loss": 0.0019, "reward": 1.7355791926383972, "reward_std": 0.1092800498008728, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7512041926383972, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 288.7109375, "epoch": 0.65283203125, "grad_norm": 17.77091921210839, "kl": 0.0572509765625, "learning_rate": 8.367919921875e-07, "loss": 0.0023, "reward": 1.644793450832367, "reward_std": 0.09384549781680107, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6526058912277222, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 289.3046875, "epoch": 0.6533203125, "grad_norm": 0.9214814360203658, "kl": 0.0531005859375, "learning_rate": 8.36669921875e-07, "loss": 0.0021, "reward": 1.7342381477355957, "reward_std": 0.03282667603343725, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7342380881309509, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 333.3359375, "epoch": 0.65380859375, "grad_norm": 1.2792896824125273, "kl": 0.0487060546875, "learning_rate": 8.365478515625e-07, "loss": 0.0019, "reward": 1.8167948126792908, "reward_std": 0.05022166669368744, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8167948722839355, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 234.703125, "epoch": 0.654296875, "grad_norm": 2.0308611626495954, "kl": 0.05078125, "learning_rate": 8.3642578125e-07, "loss": 0.002, "reward": 1.793544888496399, "reward_std": 0.05571754090487957, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7935448884963989, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 353.4609375, "epoch": 0.65478515625, "grad_norm": 2.8603926926077285, "kl": 0.0635986328125, "learning_rate": 8.363037109374999e-07, "loss": 0.0025, "reward": 1.7558925151824951, "reward_std": 0.09720181487500668, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7558925449848175, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 303.359375, "epoch": 0.6552734375, "grad_norm": 1.7594024320160508, "kl": 0.049072265625, "learning_rate": 8.361816406249999e-07, "loss": 0.002, "reward": 1.7895704507827759, "reward_std": 0.060751235112547874, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7895704209804535, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 273.984375, "epoch": 0.65576171875, "grad_norm": 2.025592104259594, "kl": 0.0482177734375, "learning_rate": 8.360595703125e-07, "loss": 0.0019, "reward": 1.7954835891723633, "reward_std": 0.11320845782756805, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8032960891723633, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 299.828125, "epoch": 0.65625, "grad_norm": 2.5560286885728267, "kl": 0.05908203125, "learning_rate": 8.359375e-07, "loss": 0.0024, "reward": 1.6292362213134766, "reward_std": 0.13106617331504822, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.637048751115799, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 338.296875, "epoch": 0.65673828125, "grad_norm": 2.542594283261598, "kl": 0.0533447265625, "learning_rate": 8.358154296875e-07, "loss": 0.0021, "reward": 1.7747212648391724, "reward_std": 0.08434459567070007, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7747212648391724, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 306.21875, "epoch": 0.6572265625, "grad_norm": 1.7311059727406346, "kl": 0.056396484375, "learning_rate": 8.35693359375e-07, "loss": 0.0023, "reward": 1.7278847694396973, "reward_std": 0.06932184100151062, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7278847694396973, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 299.7265625, "epoch": 0.65771484375, "grad_norm": 0.8011187117027543, "kl": 0.0498046875, "learning_rate": 8.355712890624999e-07, "loss": 0.002, "reward": 1.7831463813781738, "reward_std": 0.018837594892829657, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7831464111804962, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 274.265625, "epoch": 0.658203125, "grad_norm": 1.535578949505016, "kl": 0.0528564453125, "learning_rate": 8.354492187499999e-07, "loss": 0.0021, "reward": 1.7664051055908203, "reward_std": 0.0737844929099083, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7664050757884979, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 235.421875, "epoch": 0.65869140625, "grad_norm": 2.622932104029845, "kl": 0.0594482421875, "learning_rate": 8.353271484374999e-07, "loss": 0.0024, "reward": 1.6524591445922852, "reward_std": 0.0812476146966219, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6524590253829956, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 344.984375, "epoch": 0.6591796875, "grad_norm": 1.1970666562029746, "kl": 0.060302734375, "learning_rate": 8.35205078125e-07, "loss": 0.0024, "reward": 1.6671356558799744, "reward_std": 0.09243928454816341, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6827606558799744, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 353.109375, "epoch": 0.65966796875, "grad_norm": 2.009394767469386, "kl": 0.0521240234375, "learning_rate": 8.350830078125e-07, "loss": 0.0021, "reward": 1.629963755607605, "reward_std": 0.10920102149248123, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.637776255607605, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 308.5859375, "epoch": 0.66015625, "grad_norm": 2.3653616861770193, "kl": 0.0565185546875, "learning_rate": 8.349609375e-07, "loss": 0.0023, "reward": 1.7094528079032898, "reward_std": 0.07993777468800545, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7172653079032898, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 294.7265625, "epoch": 0.66064453125, "grad_norm": 1.03341094245473, "kl": 0.04931640625, "learning_rate": 8.348388671875e-07, "loss": 0.002, "reward": 1.8251619338989258, "reward_std": 0.03396361041814089, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8251619935035706, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 274.6953125, "epoch": 0.6611328125, "grad_norm": 3.4116771293429715, "kl": 0.06494140625, "learning_rate": 8.347167968749999e-07, "loss": 0.0026, "reward": 1.6079095005989075, "reward_std": 0.1288512572646141, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6157219111919403, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 429.8203125, "epoch": 0.66162109375, "grad_norm": 1.5812188323632634, "kl": 0.04443359375, "learning_rate": 8.345947265624999e-07, "loss": 0.0018, "reward": 1.760659396648407, "reward_std": 0.10924211144447327, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7684718370437622, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 392.6015625, "epoch": 0.662109375, "grad_norm": 1.3464559463319818, "kl": 0.0501708984375, "learning_rate": 8.3447265625e-07, "loss": 0.002, "reward": 1.6251134872436523, "reward_std": 0.19322798028588295, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.65636345744133, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 367.7734375, "epoch": 0.66259765625, "grad_norm": 2.458918621457383, "kl": 0.0506591796875, "learning_rate": 8.343505859375e-07, "loss": 0.002, "reward": 1.6254653930664062, "reward_std": 0.20456621050834656, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.719215452671051, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 262.5625, "epoch": 0.6630859375, "grad_norm": 1.2995869911698037, "kl": 0.0513916015625, "learning_rate": 8.34228515625e-07, "loss": 0.0021, "reward": 1.7724117040634155, "reward_std": 0.05441422015428543, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7724116742610931, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 375.9765625, "epoch": 0.66357421875, "grad_norm": 3.3800856344759063, "kl": 0.055908203125, "learning_rate": 8.341064453125e-07, "loss": 0.0022, "reward": 1.7442671656608582, "reward_std": 0.14173447713255882, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7677046656608582, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 342.1484375, "epoch": 0.6640625, "grad_norm": 2.559332784444127, "kl": 0.0445556640625, "learning_rate": 8.339843749999999e-07, "loss": 0.0018, "reward": 1.716669738292694, "reward_std": 0.11901552230119705, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7322947978973389, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 297.25, "epoch": 0.66455078125, "grad_norm": 1.4593259935506697, "kl": 0.05419921875, "learning_rate": 8.338623046874999e-07, "loss": 0.0022, "reward": 1.7508392333984375, "reward_std": 0.02747677080333233, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7508392930030823, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 328.84375, "epoch": 0.6650390625, "grad_norm": 1.2002979458589371, "kl": 0.0438232421875, "learning_rate": 8.337402343749999e-07, "loss": 0.0018, "reward": 1.81014883518219, "reward_std": 0.036064352840185165, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8101487755775452, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 316.1875, "epoch": 0.66552734375, "grad_norm": 2.6295827835981287, "kl": 0.07568359375, "learning_rate": 8.336181640625e-07, "loss": 0.003, "reward": 1.706138789653778, "reward_std": 0.1014098059386015, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7217637896537781, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 254.4296875, "epoch": 0.666015625, "grad_norm": 3.4631557074514765, "kl": 0.0574951171875, "learning_rate": 8.3349609375e-07, "loss": 0.0023, "reward": 1.6869778037071228, "reward_std": 0.1357739120721817, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6869778335094452, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 378.296875, "epoch": 0.66650390625, "grad_norm": 1.5320217928751798, "kl": 0.044677734375, "learning_rate": 8.333740234375e-07, "loss": 0.0018, "reward": 1.6233786344528198, "reward_std": 0.08880849182605743, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6780661046504974, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 355.3984375, "epoch": 0.6669921875, "grad_norm": 3.0485251313362323, "kl": 0.049072265625, "learning_rate": 8.33251953125e-07, "loss": 0.002, "reward": 1.7159647345542908, "reward_std": 0.04883173480629921, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7159647643566132, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 395.9375, "epoch": 0.66748046875, "grad_norm": 3.6031654217252314, "kl": 0.0576171875, "learning_rate": 8.331298828124999e-07, "loss": 0.0023, "reward": 1.7141339778900146, "reward_std": 0.1232108511030674, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7297589182853699, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 302.6484375, "epoch": 0.66796875, "grad_norm": 2.3747920815803014, "kl": 0.0528564453125, "learning_rate": 8.330078124999999e-07, "loss": 0.0021, "reward": 1.63826584815979, "reward_std": 0.07934099994599819, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6460783183574677, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 340.859375, "epoch": 0.66845703125, "grad_norm": 1.0962500762337954, "kl": 0.05908203125, "learning_rate": 8.328857421875e-07, "loss": 0.0024, "reward": 1.6661372780799866, "reward_std": 0.14820329658687115, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.697387307882309, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 339.5078125, "epoch": 0.6689453125, "grad_norm": 12.019208496709846, "kl": 0.053466796875, "learning_rate": 8.32763671875e-07, "loss": 0.0021, "reward": 1.6565269231796265, "reward_std": 0.11510607227683067, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6721519827842712, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 354.03125, "epoch": 0.66943359375, "grad_norm": 4.559375150848394, "kl": 0.0618896484375, "learning_rate": 8.326416015625e-07, "loss": 0.0025, "reward": 1.6015617847442627, "reward_std": 0.11551137268543243, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6249993145465851, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 294.6328125, "epoch": 0.669921875, "grad_norm": 1.3152796356639234, "kl": 0.0606689453125, "learning_rate": 8.3251953125e-07, "loss": 0.0024, "reward": 1.7060803174972534, "reward_std": 0.08381591830402613, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7217053174972534, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 263.1796875, "epoch": 0.67041015625, "grad_norm": 2.0534328187220456, "kl": 0.0609130859375, "learning_rate": 8.323974609374999e-07, "loss": 0.0024, "reward": 1.726146936416626, "reward_std": 0.033384598791599274, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726146936416626, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 295.5078125, "epoch": 0.6708984375, "grad_norm": 0.6956670519280784, "kl": 0.0565185546875, "learning_rate": 8.322753906249999e-07, "loss": 0.0023, "reward": 1.674700915813446, "reward_std": 0.08457869663834572, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.698138415813446, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 276.4375, "epoch": 0.67138671875, "grad_norm": 1.8185143553728975, "kl": 0.0467529296875, "learning_rate": 8.321533203124999e-07, "loss": 0.0019, "reward": 1.8650157451629639, "reward_std": 0.054776063188910484, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8650156855583191, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 353.65625, "epoch": 0.671875, "grad_norm": 0.7337982459358227, "kl": 0.03857421875, "learning_rate": 8.3203125e-07, "loss": 0.0015, "reward": 1.7965154647827148, "reward_std": 0.038863107562065125, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7965154945850372, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 287.0, "epoch": 0.67236328125, "grad_norm": 0.940234983069783, "kl": 0.0484619140625, "learning_rate": 8.319091796875e-07, "loss": 0.0019, "reward": 1.7178753018379211, "reward_std": 0.03893340937793255, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7178753018379211, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 348.5625, "epoch": 0.6728515625, "grad_norm": 3.0895577377191903, "kl": 0.0479736328125, "learning_rate": 8.31787109375e-07, "loss": 0.0019, "reward": 1.6909393668174744, "reward_std": 0.03531087189912796, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6909393668174744, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 293.78125, "epoch": 0.67333984375, "grad_norm": 5.314939113631747, "kl": 0.0555419921875, "learning_rate": 8.316650390625e-07, "loss": 0.0022, "reward": 1.8328039646148682, "reward_std": 0.18444261699914932, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8406165242195129, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 294.2578125, "epoch": 0.673828125, "grad_norm": 2.773926019270338, "kl": 0.0574951171875, "learning_rate": 8.315429687499999e-07, "loss": 0.0023, "reward": 1.754637897014618, "reward_std": 0.07902231067419052, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7546378672122955, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 268.3125, "epoch": 0.67431640625, "grad_norm": 2.5991565514669133, "kl": 0.05126953125, "learning_rate": 8.314208984374999e-07, "loss": 0.002, "reward": 1.697754681110382, "reward_std": 0.10455015674233437, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7055672109127045, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 268.9609375, "epoch": 0.6748046875, "grad_norm": 1.523278355501712, "kl": 0.0596923828125, "learning_rate": 8.31298828125e-07, "loss": 0.0024, "reward": 1.7877840995788574, "reward_std": 0.10126758366823196, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7877840399742126, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 265.90625, "epoch": 0.67529296875, "grad_norm": 1.7215147395484285, "kl": 0.0465087890625, "learning_rate": 8.311767578125e-07, "loss": 0.0019, "reward": 1.7264689207077026, "reward_std": 0.0403452143073082, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726468950510025, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 356.3125, "epoch": 0.67578125, "grad_norm": 8.257297851754577, "kl": 0.054931640625, "learning_rate": 8.310546875e-07, "loss": 0.0022, "reward": 1.7433820962905884, "reward_std": 0.18031665682792664, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.774632066488266, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 375.0859375, "epoch": 0.67626953125, "grad_norm": 1.0654545701165232, "kl": 0.0400390625, "learning_rate": 8.309326171875e-07, "loss": 0.0016, "reward": 1.7454291582107544, "reward_std": 0.0993618592619896, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7610541582107544, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 411.7109375, "epoch": 0.6767578125, "grad_norm": 1.040028404639345, "kl": 0.0380859375, "learning_rate": 8.308105468749999e-07, "loss": 0.0015, "reward": 1.760416865348816, "reward_std": 0.0644846223294735, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7604168355464935, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 258.84375, "epoch": 0.67724609375, "grad_norm": 3.9520714635227354, "kl": 0.05078125, "learning_rate": 8.306884765624999e-07, "loss": 0.002, "reward": 1.7678569555282593, "reward_std": 0.04989023134112358, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.767857015132904, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 315.5703125, "epoch": 0.677734375, "grad_norm": 2.9876370676551103, "kl": 0.0631103515625, "learning_rate": 8.305664062499999e-07, "loss": 0.0025, "reward": 1.741922914981842, "reward_std": 0.04209707863628864, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.741922914981842, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 322.703125, "epoch": 0.67822265625, "grad_norm": 2.5716646270109456, "kl": 0.0513916015625, "learning_rate": 8.304443359375e-07, "loss": 0.0021, "reward": 1.6752015352249146, "reward_std": 0.0383878406137228, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6752015054225922, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 342.4609375, "epoch": 0.6787109375, "grad_norm": 3.5362926281368234, "kl": 0.046142578125, "learning_rate": 8.30322265625e-07, "loss": 0.0018, "reward": 1.8196292519569397, "reward_std": 0.11842495948076248, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8274418413639069, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 340.296875, "epoch": 0.67919921875, "grad_norm": 1.7884111867788441, "kl": 0.050537109375, "learning_rate": 8.302001953125e-07, "loss": 0.002, "reward": 1.6618223786354065, "reward_std": 0.07712319865822792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6618224382400513, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 325.703125, "epoch": 0.6796875, "grad_norm": 2.6027744639630392, "kl": 0.0611572265625, "learning_rate": 8.30078125e-07, "loss": 0.0024, "reward": 1.5982427597045898, "reward_std": 0.08532186597585678, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5982428193092346, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 320.203125, "epoch": 0.68017578125, "grad_norm": 5.619667205772122, "kl": 0.0548095703125, "learning_rate": 8.299560546874999e-07, "loss": 0.0022, "reward": 1.825063407421112, "reward_std": 0.07088461332023144, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8328758478164673, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 302.09375, "epoch": 0.6806640625, "grad_norm": 4.094562358242973, "kl": 0.0579833984375, "learning_rate": 8.298339843749999e-07, "loss": 0.0023, "reward": 1.619062602519989, "reward_std": 0.0742390900850296, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.619062602519989, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 230.1875, "epoch": 0.68115234375, "grad_norm": 1.3270748091394964, "kl": 0.05126953125, "learning_rate": 8.297119140625e-07, "loss": 0.0021, "reward": 1.7332074046134949, "reward_std": 0.08660921268165112, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7488323748111725, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 257.9765625, "epoch": 0.681640625, "grad_norm": 2.4733721239899285, "kl": 0.064453125, "learning_rate": 8.2958984375e-07, "loss": 0.0026, "reward": 1.7022396922111511, "reward_std": 0.07674708962440491, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7022397220134735, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 351.8203125, "epoch": 0.68212890625, "grad_norm": 11.678019935504977, "kl": 0.0526123046875, "learning_rate": 8.294677734375e-07, "loss": 0.0021, "reward": 1.7020042538642883, "reward_std": 0.0902528464794159, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7020042836666107, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 369.46875, "epoch": 0.6826171875, "grad_norm": 1.9958555775047342, "kl": 0.0479736328125, "learning_rate": 8.29345703125e-07, "loss": 0.0019, "reward": 1.612401008605957, "reward_std": 0.11361010372638702, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6280259191989899, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 192.4609375, "epoch": 0.68310546875, "grad_norm": 5.611286013828831, "kl": 0.067626953125, "learning_rate": 8.292236328124999e-07, "loss": 0.0027, "reward": 1.6931315064430237, "reward_std": 0.0486298855394125, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6931315362453461, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 284.75, "epoch": 0.68359375, "grad_norm": 64.51783733112462, "kl": 0.068115234375, "learning_rate": 8.291015624999999e-07, "loss": 0.0027, "reward": 1.7985500693321228, "reward_std": 0.1456453576683998, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8219876885414124, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 325.5625, "epoch": 0.68408203125, "grad_norm": 1.4134996928177288, "kl": 0.0511474609375, "learning_rate": 8.289794921874999e-07, "loss": 0.002, "reward": 1.7496825456619263, "reward_std": 0.063983004540205, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7574950754642487, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 334.15625, "epoch": 0.6845703125, "grad_norm": 2.2409064158617453, "kl": 0.0560302734375, "learning_rate": 8.28857421875e-07, "loss": 0.0022, "reward": 1.5890800952911377, "reward_std": 0.07782717980444431, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5968926250934601, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 363.0078125, "epoch": 0.68505859375, "grad_norm": 1.0794027539847428, "kl": 0.0517578125, "learning_rate": 8.287353515625e-07, "loss": 0.0021, "reward": 1.7144798636436462, "reward_std": 0.08601437509059906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7144798934459686, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 279.0546875, "epoch": 0.685546875, "grad_norm": 3.5803094144613534, "kl": 0.06982421875, "learning_rate": 8.2861328125e-07, "loss": 0.0028, "reward": 1.7153080701828003, "reward_std": 0.12694889307022095, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7231204807758331, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 264.25, "epoch": 0.68603515625, "grad_norm": 2.24836561629631, "kl": 0.07275390625, "learning_rate": 8.284912109375e-07, "loss": 0.0029, "reward": 1.707879662513733, "reward_std": 0.07356316037476063, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7078795731067657, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 305.0625, "epoch": 0.6865234375, "grad_norm": 5.0710615285595075, "kl": 0.0543212890625, "learning_rate": 8.283691406249999e-07, "loss": 0.0022, "reward": 1.8069834113121033, "reward_std": 0.08053146488964558, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8069833815097809, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 313.1484375, "epoch": 0.68701171875, "grad_norm": 1.4195625786352468, "kl": 0.0478515625, "learning_rate": 8.282470703124999e-07, "loss": 0.0019, "reward": 1.8579445481300354, "reward_std": 0.06742975115776062, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8657570779323578, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 277.71875, "epoch": 0.6875, "grad_norm": 1.2744757065801622, "kl": 0.049560546875, "learning_rate": 8.28125e-07, "loss": 0.002, "reward": 1.8057802319526672, "reward_std": 0.11631088703870773, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8214052319526672, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 304.703125, "epoch": 0.68798828125, "grad_norm": 1.8422456478028055, "kl": 0.064697265625, "learning_rate": 8.280029296875e-07, "loss": 0.0026, "reward": 1.7374829053878784, "reward_std": 0.10645648092031479, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7452954351902008, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 238.625, "epoch": 0.6884765625, "grad_norm": 1.4604901987015795, "kl": 0.0531005859375, "learning_rate": 8.27880859375e-07, "loss": 0.0021, "reward": 1.8335306644439697, "reward_std": 0.03559792507439852, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8335305750370026, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 341.5625, "epoch": 0.68896484375, "grad_norm": 2.0010636772477475, "kl": 0.0567626953125, "learning_rate": 8.277587890625e-07, "loss": 0.0023, "reward": 1.7226684093475342, "reward_std": 0.0325869033113122, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7226683795452118, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 352.5390625, "epoch": 0.689453125, "grad_norm": 4.139675114890247, "kl": 0.0548095703125, "learning_rate": 8.2763671875e-07, "loss": 0.0022, "reward": 1.688076138496399, "reward_std": 0.09778516367077827, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6880761086940765, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 310.7890625, "epoch": 0.68994140625, "grad_norm": 1.0825288165225206, "kl": 0.0504150390625, "learning_rate": 8.275146484374999e-07, "loss": 0.002, "reward": 1.7078955173492432, "reward_std": 0.10900576412677765, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7235205173492432, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 339.7421875, "epoch": 0.6904296875, "grad_norm": 1.048683826956388, "kl": 0.0548095703125, "learning_rate": 8.273925781249999e-07, "loss": 0.0022, "reward": 1.711770236492157, "reward_std": 0.14532910659909248, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.743020236492157, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 210.6796875, "epoch": 0.69091796875, "grad_norm": 3.6499422191186257, "kl": 0.079833984375, "learning_rate": 8.272705078125e-07, "loss": 0.0032, "reward": 1.7406352758407593, "reward_std": 0.07238475233316422, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7406352758407593, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 240.4765625, "epoch": 0.69140625, "grad_norm": 6.765975646442193, "kl": 0.057373046875, "learning_rate": 8.271484375e-07, "loss": 0.0023, "reward": 1.7682831287384033, "reward_std": 0.06250830553472042, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7682830989360809, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 261.4921875, "epoch": 0.69189453125, "grad_norm": 2.9978780027516225, "kl": 0.06787109375, "learning_rate": 8.270263671875e-07, "loss": 0.0027, "reward": 1.7275251150131226, "reward_std": 0.03290243726223707, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7275251746177673, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 310.8671875, "epoch": 0.6923828125, "grad_norm": 0.9989255723691591, "kl": 0.0592041015625, "learning_rate": 8.26904296875e-07, "loss": 0.0024, "reward": 1.6132749319076538, "reward_std": 0.14219776540994644, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6523374617099762, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 393.25, "epoch": 0.69287109375, "grad_norm": 1.733952595360354, "kl": 0.0599365234375, "learning_rate": 8.267822265624999e-07, "loss": 0.0024, "reward": 1.529246211051941, "reward_std": 0.21004829555749893, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5683087408542633, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 363.4140625, "epoch": 0.693359375, "grad_norm": 1.4642430835196691, "kl": 0.06689453125, "learning_rate": 8.266601562499999e-07, "loss": 0.0027, "reward": 1.7179248332977295, "reward_std": 0.07703246548771858, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7179248332977295, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 301.890625, "epoch": 0.69384765625, "grad_norm": 0.9887674725498403, "kl": 0.0482177734375, "learning_rate": 8.265380859375e-07, "loss": 0.0019, "reward": 1.8166847229003906, "reward_std": 0.1480160653591156, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8323096930980682, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 319.015625, "epoch": 0.6943359375, "grad_norm": 2.2480619414400524, "kl": 0.06787109375, "learning_rate": 8.26416015625e-07, "loss": 0.0027, "reward": 1.640427827835083, "reward_std": 0.15429722517728806, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6560528576374054, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 321.796875, "epoch": 0.69482421875, "grad_norm": 1.6615444011731868, "kl": 0.0548095703125, "learning_rate": 8.262939453125e-07, "loss": 0.0022, "reward": 1.8311384916305542, "reward_std": 0.08753996156156063, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8389509916305542, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 303.15625, "epoch": 0.6953125, "grad_norm": 1.3951149530350935, "kl": 0.0712890625, "learning_rate": 8.26171875e-07, "loss": 0.0028, "reward": 1.7672319412231445, "reward_std": 0.1136610172688961, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7672319114208221, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 340.6796875, "epoch": 0.69580078125, "grad_norm": 1.1270021475821117, "kl": 0.043212890625, "learning_rate": 8.260498046875e-07, "loss": 0.0017, "reward": 1.7945731282234192, "reward_std": 0.10133310779929161, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8180106282234192, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 252.3515625, "epoch": 0.6962890625, "grad_norm": 1.0404208747858652, "kl": 0.0528564453125, "learning_rate": 8.259277343749999e-07, "loss": 0.0021, "reward": 1.8590461611747742, "reward_std": 0.06993057578802109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8590461909770966, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 350.4140625, "epoch": 0.69677734375, "grad_norm": 1.5437058523157996, "kl": 0.0621337890625, "learning_rate": 8.258056640624999e-07, "loss": 0.0025, "reward": 1.622244954109192, "reward_std": 0.10384193528443575, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6378699541091919, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 272.0859375, "epoch": 0.697265625, "grad_norm": 1.7190179987884586, "kl": 0.0633544921875, "learning_rate": 8.2568359375e-07, "loss": 0.0025, "reward": 1.7580605745315552, "reward_std": 0.05973019078373909, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7580606043338776, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 294.4296875, "epoch": 0.69775390625, "grad_norm": 1.6679202767670824, "kl": 0.053955078125, "learning_rate": 8.255615234375e-07, "loss": 0.0022, "reward": 1.8018113374710083, "reward_std": 0.049905733205378056, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8018112778663635, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 349.078125, "epoch": 0.6982421875, "grad_norm": 1.2612943936774128, "kl": 0.0543212890625, "learning_rate": 8.25439453125e-07, "loss": 0.0022, "reward": 1.7309202551841736, "reward_std": 0.09569451212882996, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7387327551841736, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 279.1875, "epoch": 0.69873046875, "grad_norm": 1.6270467995733127, "kl": 0.06689453125, "learning_rate": 8.253173828125e-07, "loss": 0.0027, "reward": 1.8153039813041687, "reward_std": 0.0769112091511488, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8309289515018463, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 238.2265625, "epoch": 0.69921875, "grad_norm": 3.113671634396718, "kl": 0.05908203125, "learning_rate": 8.251953124999999e-07, "loss": 0.0024, "reward": 1.7506902813911438, "reward_std": 0.05832826718688011, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7506902813911438, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 263.28125, "epoch": 0.69970703125, "grad_norm": 1.5482948608367026, "kl": 0.052490234375, "learning_rate": 8.250732421874999e-07, "loss": 0.0021, "reward": 1.7495105266571045, "reward_std": 0.047743687871843576, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7495104968547821, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 337.703125, "epoch": 0.7001953125, "grad_norm": 3.575458871440732, "kl": 0.0628662109375, "learning_rate": 8.24951171875e-07, "loss": 0.0025, "reward": 1.7153109312057495, "reward_std": 0.10487351939082146, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7231234312057495, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 306.1015625, "epoch": 0.70068359375, "grad_norm": 0.9120133104435602, "kl": 0.05419921875, "learning_rate": 8.248291015625e-07, "loss": 0.0022, "reward": 1.70259028673172, "reward_std": 0.0484690060839057, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.70259028673172, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 253.8828125, "epoch": 0.701171875, "grad_norm": 2.4781810496741126, "kl": 0.0616455078125, "learning_rate": 8.2470703125e-07, "loss": 0.0025, "reward": 1.8051986694335938, "reward_std": 0.05168750695884228, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8051986396312714, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 251.0234375, "epoch": 0.70166015625, "grad_norm": 1.959151036800492, "kl": 0.0552978515625, "learning_rate": 8.245849609375e-07, "loss": 0.0022, "reward": 1.7046823501586914, "reward_std": 0.07530912198126316, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.704682320356369, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 267.8671875, "epoch": 0.7021484375, "grad_norm": 1.1853692300427936, "kl": 0.0565185546875, "learning_rate": 8.24462890625e-07, "loss": 0.0023, "reward": 1.7881206274032593, "reward_std": 0.07596137002110481, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7959331572055817, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 258.453125, "epoch": 0.70263671875, "grad_norm": 8.816391483092268, "kl": 0.4215087890625, "learning_rate": 8.243408203124999e-07, "loss": 0.0169, "reward": 1.803566336631775, "reward_std": 0.08476324006915092, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8113788068294525, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 203.2578125, "epoch": 0.703125, "grad_norm": 3.150118996033216, "kl": 0.079345703125, "learning_rate": 8.242187499999999e-07, "loss": 0.0032, "reward": 1.65059894323349, "reward_std": 0.1842002421617508, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.65059894323349, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 300.8125, "epoch": 0.70361328125, "grad_norm": 1.8365019943222014, "kl": 0.047119140625, "learning_rate": 8.240966796875e-07, "loss": 0.0019, "reward": 1.7905904650688171, "reward_std": 0.026320545002818108, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7905905246734619, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 273.65625, "epoch": 0.7041015625, "grad_norm": 10.324751382988245, "kl": 0.0635986328125, "learning_rate": 8.23974609375e-07, "loss": 0.0025, "reward": 1.636955440044403, "reward_std": 0.16419149935245514, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6447679400444031, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 272.6953125, "epoch": 0.70458984375, "grad_norm": 1.5794799873389354, "kl": 0.0592041015625, "learning_rate": 8.238525390625e-07, "loss": 0.0024, "reward": 1.7604875564575195, "reward_std": 0.09106075949966908, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7604875266551971, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 304.96875, "epoch": 0.705078125, "grad_norm": 1.7122619531111243, "kl": 0.049560546875, "learning_rate": 8.2373046875e-07, "loss": 0.002, "reward": 1.7385912537574768, "reward_std": 0.15688905864953995, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7932787239551544, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 250.359375, "epoch": 0.70556640625, "grad_norm": 4.933104035300985, "kl": 0.0655517578125, "learning_rate": 8.236083984374999e-07, "loss": 0.0026, "reward": 1.6635677814483643, "reward_std": 0.3314622938632965, "rewards/format_reward": 0.859375, "rewards/ocr_reward": 0.8041927814483643, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 344.765625, "epoch": 0.7060546875, "grad_norm": 7.123204221199448, "kl": 0.0577392578125, "learning_rate": 8.234863281249999e-07, "loss": 0.0023, "reward": 1.5581657886505127, "reward_std": 0.23571809381246567, "rewards/format_reward": 0.8828125, "rewards/ocr_reward": 0.6753532588481903, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 349.203125, "epoch": 0.70654296875, "grad_norm": 1.701577139579483, "kl": 0.052490234375, "learning_rate": 8.233642578125e-07, "loss": 0.0021, "reward": 1.5181033611297607, "reward_std": 0.24801450222730637, "rewards/format_reward": 0.90625, "rewards/ocr_reward": 0.6118534803390503, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 324.609375, "epoch": 0.70703125, "grad_norm": 1.7700024228747016, "kl": 0.0506591796875, "learning_rate": 8.232421875e-07, "loss": 0.002, "reward": 1.7385361194610596, "reward_std": 0.1337948441505432, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7775986790657043, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 342.0859375, "epoch": 0.70751953125, "grad_norm": 0.9369244908723074, "kl": 0.0423583984375, "learning_rate": 8.231201171875e-07, "loss": 0.0017, "reward": 1.7505207657814026, "reward_std": 0.17919845134019852, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7739582657814026, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 315.28125, "epoch": 0.7080078125, "grad_norm": 1.7043675709930337, "kl": 0.0504150390625, "learning_rate": 8.22998046875e-07, "loss": 0.002, "reward": 1.778750240802765, "reward_std": 0.11969216167926788, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7865626811981201, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 303.0234375, "epoch": 0.70849609375, "grad_norm": 3.7096740843912497, "kl": 0.075439453125, "learning_rate": 8.228759765625e-07, "loss": 0.003, "reward": 1.6811388731002808, "reward_std": 0.16221491992473602, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7123888731002808, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 355.7890625, "epoch": 0.708984375, "grad_norm": 1.1016182213271797, "kl": 0.0509033203125, "learning_rate": 8.227539062499999e-07, "loss": 0.002, "reward": 1.730841338634491, "reward_std": 0.043069666251540184, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7308413684368134, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 280.8046875, "epoch": 0.70947265625, "grad_norm": 4.52514586838674, "kl": 0.0596923828125, "learning_rate": 8.226318359374999e-07, "loss": 0.0024, "reward": 1.6379446983337402, "reward_std": 0.09933317825198174, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6379446387290955, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 340.078125, "epoch": 0.7099609375, "grad_norm": 1.5113110300717163, "kl": 0.0423583984375, "learning_rate": 8.22509765625e-07, "loss": 0.0017, "reward": 1.743843913078308, "reward_std": 0.09296439960598946, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7750938832759857, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 301.2890625, "epoch": 0.71044921875, "grad_norm": 2.5506627271636586, "kl": 0.0540771484375, "learning_rate": 8.223876953125e-07, "loss": 0.0022, "reward": 1.8323869109153748, "reward_std": 0.10052505135536194, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8401993811130524, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 365.2421875, "epoch": 0.7109375, "grad_norm": 3.1270600079448694, "kl": 0.0478515625, "learning_rate": 8.22265625e-07, "loss": 0.0019, "reward": 1.7445816397666931, "reward_std": 0.09944414719939232, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7680192291736603, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 317.7578125, "epoch": 0.71142578125, "grad_norm": 2.430403544666908, "kl": 0.0513916015625, "learning_rate": 8.221435546875e-07, "loss": 0.0021, "reward": 1.7487914562225342, "reward_std": 0.07037571631371975, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7487914264202118, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 423.875, "epoch": 0.7119140625, "grad_norm": 2.5085519820373166, "kl": 0.0556640625, "learning_rate": 8.220214843749999e-07, "loss": 0.0022, "reward": 1.7057527303695679, "reward_std": 0.13188474997878075, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7291902005672455, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 308.3515625, "epoch": 0.71240234375, "grad_norm": 2.925211143069262, "kl": 0.0548095703125, "learning_rate": 8.218994140624999e-07, "loss": 0.0022, "reward": 1.6985459327697754, "reward_std": 0.09991316497325897, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7063583731651306, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 365.0703125, "epoch": 0.712890625, "grad_norm": 0.8890589616956593, "kl": 0.0518798828125, "learning_rate": 8.217773437499999e-07, "loss": 0.0021, "reward": 1.8091995120048523, "reward_std": 0.03047786932438612, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8091995716094971, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 416.828125, "epoch": 0.71337890625, "grad_norm": 19.189378131745773, "kl": 0.055419921875, "learning_rate": 8.216552734375e-07, "loss": 0.0022, "reward": 1.7730653285980225, "reward_std": 0.07386335171759129, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7808778285980225, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 349.3046875, "epoch": 0.7138671875, "grad_norm": 2.5112538884341795, "kl": 0.051025390625, "learning_rate": 8.21533203125e-07, "loss": 0.002, "reward": 1.7594855427742004, "reward_std": 0.0999723095446825, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7751105725765228, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 431.09375, "epoch": 0.71435546875, "grad_norm": 1.8674018464220214, "kl": 0.0516357421875, "learning_rate": 8.214111328125e-07, "loss": 0.0021, "reward": 1.768634557723999, "reward_std": 0.15559392422437668, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7842595875263214, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 337.4921875, "epoch": 0.71484375, "grad_norm": 3.647003211757384, "kl": 0.05517578125, "learning_rate": 8.212890625e-07, "loss": 0.0022, "reward": 1.6912202835083008, "reward_std": 0.10662208870053291, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6912202537059784, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 364.1953125, "epoch": 0.71533203125, "grad_norm": 1.9989805075274047, "kl": 0.0462646484375, "learning_rate": 8.211669921874999e-07, "loss": 0.0019, "reward": 1.8012118935585022, "reward_std": 0.04404502548277378, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012118637561798, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 277.46875, "epoch": 0.7158203125, "grad_norm": 1.920187943953283, "kl": 0.066650390625, "learning_rate": 8.210449218749999e-07, "loss": 0.0027, "reward": 1.689796507358551, "reward_std": 0.08326592482626438, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6897964179515839, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 235.328125, "epoch": 0.71630859375, "grad_norm": 1.177187426188002, "kl": 0.0546875, "learning_rate": 8.209228515625e-07, "loss": 0.0022, "reward": 1.8154129385948181, "reward_std": 0.05495606176555157, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8154129683971405, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 379.4296875, "epoch": 0.716796875, "grad_norm": 2.430884718722628, "kl": 0.0477294921875, "learning_rate": 8.2080078125e-07, "loss": 0.0019, "reward": 1.6982349157333374, "reward_std": 0.1398230344057083, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7138599455356598, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 465.640625, "epoch": 0.71728515625, "grad_norm": 17.02941879508463, "kl": 0.0479736328125, "learning_rate": 8.206787109375e-07, "loss": 0.0019, "reward": 1.6150004267692566, "reward_std": 0.21229281276464462, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6618753671646118, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 371.484375, "epoch": 0.7177734375, "grad_norm": 2.690446110295151, "kl": 0.052490234375, "learning_rate": 8.20556640625e-07, "loss": 0.0021, "reward": 1.6503348350524902, "reward_std": 0.1022627055644989, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6581473350524902, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 353.4375, "epoch": 0.71826171875, "grad_norm": 1.031554242337561, "kl": 0.052001953125, "learning_rate": 8.204345703124999e-07, "loss": 0.0021, "reward": 1.7112751603126526, "reward_std": 0.13028262928128242, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7347126603126526, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 281.0234375, "epoch": 0.71875, "grad_norm": 2.5851949705933013, "kl": 0.062744140625, "learning_rate": 8.203124999999999e-07, "loss": 0.0025, "reward": 1.6422898769378662, "reward_std": 0.12438905239105225, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6501024663448334, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 354.234375, "epoch": 0.71923828125, "grad_norm": 6.193724547415232, "kl": 0.078125, "learning_rate": 8.201904296874999e-07, "loss": 0.0031, "reward": 1.8023394346237183, "reward_std": 0.09207788482308388, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8023395538330078, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 303.5625, "epoch": 0.7197265625, "grad_norm": 1.2219439151470035, "kl": 0.0506591796875, "learning_rate": 8.20068359375e-07, "loss": 0.002, "reward": 1.777301549911499, "reward_std": 0.04901622235774994, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.785114049911499, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 261.9140625, "epoch": 0.72021484375, "grad_norm": 2.927095284762868, "kl": 0.0487060546875, "learning_rate": 8.199462890625e-07, "loss": 0.0019, "reward": 1.715105950832367, "reward_std": 0.12777616456151009, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7229184210300446, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 353.546875, "epoch": 0.720703125, "grad_norm": 3.516281392962193, "kl": 0.0496826171875, "learning_rate": 8.1982421875e-07, "loss": 0.002, "reward": 1.6445563435554504, "reward_std": 0.1418607532978058, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6601813733577728, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 334.234375, "epoch": 0.72119140625, "grad_norm": 1.3865604707622783, "kl": 0.0531005859375, "learning_rate": 8.197021484375e-07, "loss": 0.0021, "reward": 1.711554229259491, "reward_std": 0.08468777127563953, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7193666994571686, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 311.078125, "epoch": 0.7216796875, "grad_norm": 1.345200066011625, "kl": 0.065673828125, "learning_rate": 8.195800781249999e-07, "loss": 0.0026, "reward": 1.7353711128234863, "reward_std": 0.13099960051476955, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7588086724281311, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 324.4765625, "epoch": 0.72216796875, "grad_norm": 1.584912796001927, "kl": 0.056884765625, "learning_rate": 8.194580078124999e-07, "loss": 0.0023, "reward": 1.76387220621109, "reward_std": 0.11961934715509415, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7638722062110901, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 334.9140625, "epoch": 0.72265625, "grad_norm": 1.7559205041484878, "kl": 0.0538330078125, "learning_rate": 8.193359375e-07, "loss": 0.0022, "reward": 1.7752057313919067, "reward_std": 0.05388793349266052, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7752057611942291, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 287.1328125, "epoch": 0.72314453125, "grad_norm": 3.609119053953651, "kl": 0.0548095703125, "learning_rate": 8.192138671875e-07, "loss": 0.0022, "reward": 1.7668498158454895, "reward_std": 0.05071160942316055, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7668498754501343, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 373.0390625, "epoch": 0.7236328125, "grad_norm": 2.9702748681991893, "kl": 0.0516357421875, "learning_rate": 8.19091796875e-07, "loss": 0.0021, "reward": 1.7055724263191223, "reward_std": 0.14055679365992546, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7211975157260895, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 367.5234375, "epoch": 0.72412109375, "grad_norm": 2.6429417488503884, "kl": 0.045166015625, "learning_rate": 8.189697265625e-07, "loss": 0.0018, "reward": 1.8638358116149902, "reward_std": 0.09294469654560089, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8638357818126678, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 280.1015625, "epoch": 0.724609375, "grad_norm": 1.1890261444927173, "kl": 0.0614013671875, "learning_rate": 8.188476562499999e-07, "loss": 0.0025, "reward": 1.6774348616600037, "reward_std": 0.11033051460981369, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6852473616600037, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 297.4375, "epoch": 0.72509765625, "grad_norm": 1.8392107416740417, "kl": 0.061279296875, "learning_rate": 8.187255859374999e-07, "loss": 0.0025, "reward": 1.6773231625556946, "reward_std": 0.12323976308107376, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6851356625556946, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 353.390625, "epoch": 0.7255859375, "grad_norm": 2.4085650228970623, "kl": 0.06103515625, "learning_rate": 8.186035156249999e-07, "loss": 0.0024, "reward": 1.7235342264175415, "reward_std": 0.16048508323729038, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7469716966152191, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 329.8671875, "epoch": 0.72607421875, "grad_norm": 2.408009729634376, "kl": 0.0439453125, "learning_rate": 8.184814453125e-07, "loss": 0.0018, "reward": 1.7089346051216125, "reward_std": 0.0976857841014862, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7089346349239349, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 264.7421875, "epoch": 0.7265625, "grad_norm": 1.6142248756995168, "kl": 0.0546875, "learning_rate": 8.18359375e-07, "loss": 0.0022, "reward": 1.774111568927765, "reward_std": 0.09651333093643188, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7819240987300873, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 400.3515625, "epoch": 0.72705078125, "grad_norm": 1.8517869550068955, "kl": 0.0472412109375, "learning_rate": 8.182373046875e-07, "loss": 0.0019, "reward": 1.7990041971206665, "reward_std": 0.14311717450618744, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8146291375160217, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 312.8984375, "epoch": 0.7275390625, "grad_norm": 2.7862564678417985, "kl": 0.04931640625, "learning_rate": 8.18115234375e-07, "loss": 0.002, "reward": 1.8134875893592834, "reward_std": 0.046054454520344734, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8134876191616058, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 281.5546875, "epoch": 0.72802734375, "grad_norm": 1.0550017725621663, "kl": 0.060791015625, "learning_rate": 8.179931640624999e-07, "loss": 0.0024, "reward": 1.8162503838539124, "reward_std": 0.043524582870304585, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8162504434585571, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 254.5703125, "epoch": 0.728515625, "grad_norm": 2.833553892053886, "kl": 0.071044921875, "learning_rate": 8.178710937499999e-07, "loss": 0.0028, "reward": 1.6208914518356323, "reward_std": 0.1067028883844614, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6287039518356323, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 291.8671875, "epoch": 0.72900390625, "grad_norm": 2.8589313865508745, "kl": 0.0618896484375, "learning_rate": 8.177490234375e-07, "loss": 0.0025, "reward": 1.6766908764839172, "reward_std": 0.06825340539216995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6766908168792725, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 319.9765625, "epoch": 0.7294921875, "grad_norm": 2.124207189204002, "kl": 0.0760498046875, "learning_rate": 8.17626953125e-07, "loss": 0.003, "reward": 1.5833302736282349, "reward_std": 0.1273394152522087, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6067677438259125, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 261.203125, "epoch": 0.72998046875, "grad_norm": 2.3973851983698773, "kl": 0.067626953125, "learning_rate": 8.175048828125e-07, "loss": 0.0027, "reward": 1.7951343655586243, "reward_std": 0.024024500511586666, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7951343059539795, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 300.328125, "epoch": 0.73046875, "grad_norm": 1.3073917262465702, "kl": 0.055419921875, "learning_rate": 8.173828125e-07, "loss": 0.0022, "reward": 1.7663521766662598, "reward_std": 0.03600446879863739, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7663521468639374, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 405.453125, "epoch": 0.73095703125, "grad_norm": 1.6664383912533873, "kl": 0.056396484375, "learning_rate": 8.172607421874999e-07, "loss": 0.0023, "reward": 1.6536216139793396, "reward_std": 0.16317107900977135, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6770591139793396, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 298.703125, "epoch": 0.7314453125, "grad_norm": 2.6043460259230633, "kl": 0.052001953125, "learning_rate": 8.171386718749999e-07, "loss": 0.0021, "reward": 1.8190729022026062, "reward_std": 0.07409404963254929, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8190728724002838, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 411.03125, "epoch": 0.73193359375, "grad_norm": 1.6907569298922398, "kl": 0.0511474609375, "learning_rate": 8.170166015624999e-07, "loss": 0.002, "reward": 1.6964725852012634, "reward_std": 0.07454644329845905, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7042850852012634, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 280.859375, "epoch": 0.732421875, "grad_norm": 2.688496165901201, "kl": 0.060546875, "learning_rate": 8.1689453125e-07, "loss": 0.0024, "reward": 1.7929801940917969, "reward_std": 0.058575745671987534, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7929801940917969, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 301.828125, "epoch": 0.73291015625, "grad_norm": 2.8535089955662105, "kl": 0.069091796875, "learning_rate": 8.167724609375e-07, "loss": 0.0028, "reward": 1.6650619506835938, "reward_std": 0.07004339620471, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6650619506835938, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 237.9765625, "epoch": 0.7333984375, "grad_norm": 2.0939627649247443, "kl": 0.06982421875, "learning_rate": 8.16650390625e-07, "loss": 0.0028, "reward": 1.6487025022506714, "reward_std": 0.0736217126250267, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6487023830413818, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 363.0234375, "epoch": 0.73388671875, "grad_norm": 1.835761690123511, "kl": 0.058349609375, "learning_rate": 8.165283203125e-07, "loss": 0.0023, "reward": 1.6299150586128235, "reward_std": 0.16713447123765945, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6689775288105011, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 331.5703125, "epoch": 0.734375, "grad_norm": 1.7502378181934324, "kl": 0.0660400390625, "learning_rate": 8.164062499999999e-07, "loss": 0.0026, "reward": 1.7362866401672363, "reward_std": 0.06938813626766205, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7362865805625916, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 380.046875, "epoch": 0.73486328125, "grad_norm": 1.925910858116271, "kl": 0.045654296875, "learning_rate": 8.162841796874999e-07, "loss": 0.0018, "reward": 1.76516455411911, "reward_std": 0.08768405765295029, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7729770541191101, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 263.421875, "epoch": 0.7353515625, "grad_norm": 1.8350835868922448, "kl": 0.08154296875, "learning_rate": 8.16162109375e-07, "loss": 0.0033, "reward": 1.7183210253715515, "reward_std": 0.09937049448490143, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7261334359645844, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 275.296875, "epoch": 0.73583984375, "grad_norm": 1.2480606209927823, "kl": 0.056884765625, "learning_rate": 8.160400390625e-07, "loss": 0.0023, "reward": 1.800473690032959, "reward_std": 0.036239128559827805, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8004737496376038, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 263.53125, "epoch": 0.736328125, "grad_norm": 3.8267954710393566, "kl": 0.058349609375, "learning_rate": 8.1591796875e-07, "loss": 0.0023, "reward": 1.8211953043937683, "reward_std": 0.055461274459958076, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8211952745914459, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 428.0234375, "epoch": 0.73681640625, "grad_norm": 2.245415335980976, "kl": 0.056640625, "learning_rate": 8.157958984375e-07, "loss": 0.0023, "reward": 1.6801503896713257, "reward_std": 0.15717144310474396, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7114003896713257, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 355.7421875, "epoch": 0.7373046875, "grad_norm": 0.8186989855394431, "kl": 0.0587158203125, "learning_rate": 8.15673828125e-07, "loss": 0.0023, "reward": 1.7053476572036743, "reward_std": 0.09524018689990044, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7131602764129639, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 292.296875, "epoch": 0.73779296875, "grad_norm": 3.6173846814752046, "kl": 0.05419921875, "learning_rate": 8.155517578124999e-07, "loss": 0.0022, "reward": 1.7306747436523438, "reward_std": 0.10750394687056541, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7384872734546661, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 396.4921875, "epoch": 0.73828125, "grad_norm": 1.951320596223026, "kl": 0.066162109375, "learning_rate": 8.154296874999999e-07, "loss": 0.0026, "reward": 1.6698977947235107, "reward_std": 0.17629149928689003, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7245852947235107, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 297.9609375, "epoch": 0.73876953125, "grad_norm": 1.6633533126989715, "kl": 0.059326171875, "learning_rate": 8.153076171875e-07, "loss": 0.0024, "reward": 1.724345088005066, "reward_std": 0.07205065805464983, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7321575880050659, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 420.828125, "epoch": 0.7392578125, "grad_norm": 2.1092767969234525, "kl": 0.045166015625, "learning_rate": 8.15185546875e-07, "loss": 0.0018, "reward": 1.6450571417808533, "reward_std": 0.17002198845148087, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6997447311878204, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 361.03125, "epoch": 0.73974609375, "grad_norm": 1.0066903202791224, "kl": 0.06298828125, "learning_rate": 8.150634765625e-07, "loss": 0.0025, "reward": 1.6437667608261108, "reward_std": 0.10886374488472939, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6672042906284332, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 417.671875, "epoch": 0.740234375, "grad_norm": 1.0622351578079063, "kl": 0.060546875, "learning_rate": 8.1494140625e-07, "loss": 0.0024, "reward": 1.641761600971222, "reward_std": 0.15896305441856384, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.7120741009712219, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 288.7421875, "epoch": 0.74072265625, "grad_norm": 1.6360500618274396, "kl": 0.0634765625, "learning_rate": 8.148193359374999e-07, "loss": 0.0025, "reward": 1.7401413321495056, "reward_std": 0.042983127757906914, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.740141361951828, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 359.8828125, "epoch": 0.7412109375, "grad_norm": 1.010630811381439, "kl": 0.0531005859375, "learning_rate": 8.146972656249999e-07, "loss": 0.0021, "reward": 1.7111788988113403, "reward_std": 0.07130059599876404, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7111788690090179, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 308.734375, "epoch": 0.74169921875, "grad_norm": 0.8278353373184272, "kl": 0.0540771484375, "learning_rate": 8.145751953125e-07, "loss": 0.0022, "reward": 1.8538936972618103, "reward_std": 0.11458700150251389, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8695186972618103, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 369.609375, "epoch": 0.7421875, "grad_norm": 1.433959106877648, "kl": 0.0457763671875, "learning_rate": 8.14453125e-07, "loss": 0.0018, "reward": 1.8906309008598328, "reward_std": 0.06328525394201279, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8906309306621552, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 231.28125, "epoch": 0.74267578125, "grad_norm": 8.396740339740772, "kl": 0.075927734375, "learning_rate": 8.143310546875e-07, "loss": 0.003, "reward": 1.7359008193016052, "reward_std": 0.10937487334012985, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7359007894992828, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 237.2109375, "epoch": 0.7431640625, "grad_norm": 2.401371893041721, "kl": 0.0625, "learning_rate": 8.14208984375e-07, "loss": 0.0025, "reward": 1.8299207091331482, "reward_std": 0.05006260797381401, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8299207091331482, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 377.5390625, "epoch": 0.74365234375, "grad_norm": 4.919411402041056, "kl": 0.0687255859375, "learning_rate": 8.140869140625e-07, "loss": 0.0027, "reward": 1.7033655643463135, "reward_std": 0.13213280774652958, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7189904749393463, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 329.6640625, "epoch": 0.744140625, "grad_norm": 3.5288846647599983, "kl": 0.0665283203125, "learning_rate": 8.139648437499999e-07, "loss": 0.0027, "reward": 1.6737890839576721, "reward_std": 0.11412935890257359, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6816015839576721, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 340.6171875, "epoch": 0.74462890625, "grad_norm": 0.9645495205101616, "kl": 0.0430908203125, "learning_rate": 8.138427734374999e-07, "loss": 0.0017, "reward": 1.7587640285491943, "reward_std": 0.05514438450336456, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7587640285491943, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 283.875, "epoch": 0.7451171875, "grad_norm": 2.1056851901873856, "kl": 0.0640869140625, "learning_rate": 8.13720703125e-07, "loss": 0.0026, "reward": 1.68757826089859, "reward_std": 0.0944238007068634, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6875782012939453, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 294.8828125, "epoch": 0.74560546875, "grad_norm": 12.263008903251595, "kl": 0.0643310546875, "learning_rate": 8.135986328125e-07, "loss": 0.0026, "reward": 1.6637163162231445, "reward_std": 0.087074875831604, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6715288162231445, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 299.90625, "epoch": 0.74609375, "grad_norm": 0.7627901351331267, "kl": 0.050537109375, "learning_rate": 8.134765625e-07, "loss": 0.002, "reward": 1.863888144493103, "reward_std": 0.08409961871802807, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8717006146907806, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 343.4296875, "epoch": 0.74658203125, "grad_norm": 1.4130142844576217, "kl": 0.0479736328125, "learning_rate": 8.133544921875e-07, "loss": 0.0019, "reward": 1.8731828331947327, "reward_std": 0.04130223486572504, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8731828331947327, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 384.859375, "epoch": 0.7470703125, "grad_norm": 1.9620363842722353, "kl": 0.052978515625, "learning_rate": 8.132324218749999e-07, "loss": 0.0021, "reward": 1.7268319129943848, "reward_std": 0.07933101058006287, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7346444129943848, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 274.53125, "epoch": 0.74755859375, "grad_norm": 8.38649985748051, "kl": 0.0621337890625, "learning_rate": 8.131103515624999e-07, "loss": 0.0025, "reward": 1.716759443283081, "reward_std": 0.08667516149580479, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.724571943283081, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 272.7265625, "epoch": 0.748046875, "grad_norm": 1.7653142086688058, "kl": 0.076416015625, "learning_rate": 8.1298828125e-07, "loss": 0.003, "reward": 1.8313519358634949, "reward_std": 0.060592420399188995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8313519060611725, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 241.2109375, "epoch": 0.74853515625, "grad_norm": 2.6054561123555398, "kl": 0.0703125, "learning_rate": 8.128662109375e-07, "loss": 0.0028, "reward": 1.737118661403656, "reward_std": 0.10131100192666054, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7371186316013336, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 228.8984375, "epoch": 0.7490234375, "grad_norm": 2.674357738045574, "kl": 0.0667724609375, "learning_rate": 8.12744140625e-07, "loss": 0.0027, "reward": 1.7102715373039246, "reward_std": 0.05548026505857706, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.710271567106247, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 256.125, "epoch": 0.74951171875, "grad_norm": 4.73739128631788, "kl": 0.0626220703125, "learning_rate": 8.126220703125e-07, "loss": 0.0025, "reward": 1.7168715000152588, "reward_std": 0.04072634130716324, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7168715000152588, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 312.046875, "epoch": 0.75, "grad_norm": 1.1110552148698716, "kl": 0.0604248046875, "learning_rate": 8.125e-07, "loss": 0.0024, "reward": 1.8226452469825745, "reward_std": 0.07101005595177412, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8226452171802521, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 268.375, "epoch": 0.75048828125, "grad_norm": 1.8999368365222582, "kl": 0.06689453125, "learning_rate": 8.123779296874999e-07, "loss": 0.0027, "reward": 1.7307413220405579, "reward_std": 0.06915171444416046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7307413220405579, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 315.84375, "epoch": 0.7509765625, "grad_norm": 1.3088249128930414, "kl": 0.0545654296875, "learning_rate": 8.122558593749999e-07, "loss": 0.0022, "reward": 1.7781055569648743, "reward_std": 0.11129429191350937, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7781055569648743, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 326.0390625, "epoch": 0.75146484375, "grad_norm": 1.8135902702393156, "kl": 0.064453125, "learning_rate": 8.121337890625e-07, "loss": 0.0026, "reward": 1.7776609063148499, "reward_std": 0.05267609283328056, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7776609361171722, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 343.5703125, "epoch": 0.751953125, "grad_norm": 4.248969243149114, "kl": 0.061767578125, "learning_rate": 8.1201171875e-07, "loss": 0.0025, "reward": 1.6320134997367859, "reward_std": 0.16212911903858185, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.671076089143753, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 320.0703125, "epoch": 0.75244140625, "grad_norm": 3.2351824322471487, "kl": 0.0506591796875, "learning_rate": 8.118896484375e-07, "loss": 0.002, "reward": 1.8071049451828003, "reward_std": 0.03557584714144468, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8071048855781555, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 301.484375, "epoch": 0.7529296875, "grad_norm": 2.1607593809932912, "kl": 0.0684814453125, "learning_rate": 8.11767578125e-07, "loss": 0.0027, "reward": 1.7182350158691406, "reward_std": 0.0703788474202156, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7182350158691406, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 345.171875, "epoch": 0.75341796875, "grad_norm": 2.28986197589543, "kl": 0.0521240234375, "learning_rate": 8.116455078124999e-07, "loss": 0.0021, "reward": 1.6649247407913208, "reward_std": 0.10756101086735725, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.664924681186676, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 274.265625, "epoch": 0.75390625, "grad_norm": 1.3873712534548976, "kl": 0.0587158203125, "learning_rate": 8.115234374999999e-07, "loss": 0.0023, "reward": 1.7909113764762878, "reward_std": 0.05301499832421541, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7909113466739655, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 332.96875, "epoch": 0.75439453125, "grad_norm": 1.3747071192917304, "kl": 0.0538330078125, "learning_rate": 8.114013671875e-07, "loss": 0.0021, "reward": 1.5880799293518066, "reward_std": 0.05071425810456276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5880799889564514, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 280.078125, "epoch": 0.7548828125, "grad_norm": 2.082728546980293, "kl": 0.0687255859375, "learning_rate": 8.11279296875e-07, "loss": 0.0027, "reward": 1.7272522449493408, "reward_std": 0.11839665472507477, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7272522151470184, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 289.8125, "epoch": 0.75537109375, "grad_norm": 4.2783261615543555, "kl": 0.0557861328125, "learning_rate": 8.111572265625e-07, "loss": 0.0022, "reward": 1.6325949430465698, "reward_std": 0.08264567703008652, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6716574430465698, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 326.4609375, "epoch": 0.755859375, "grad_norm": 1.1440391734134507, "kl": 0.046630859375, "learning_rate": 8.1103515625e-07, "loss": 0.0019, "reward": 1.71382474899292, "reward_std": 0.045681871473789215, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7138247489929199, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 279.328125, "epoch": 0.75634765625, "grad_norm": 10.451777449180478, "kl": 0.0516357421875, "learning_rate": 8.109130859375e-07, "loss": 0.0021, "reward": 1.6753268241882324, "reward_std": 0.021205293014645576, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.67532679438591, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 293.796875, "epoch": 0.7568359375, "grad_norm": 2.3203833368592903, "kl": 0.0751953125, "learning_rate": 8.107910156249999e-07, "loss": 0.003, "reward": 1.7479038834571838, "reward_std": 0.1040516346693039, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7557163834571838, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 459.453125, "epoch": 0.75732421875, "grad_norm": 1.314358162889798, "kl": 0.05078125, "learning_rate": 8.106689453124999e-07, "loss": 0.002, "reward": 1.6848346590995789, "reward_std": 0.15354808419942856, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7160846590995789, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 323.9921875, "epoch": 0.7578125, "grad_norm": 2.609690852803103, "kl": 0.055908203125, "learning_rate": 8.10546875e-07, "loss": 0.0022, "reward": 1.7020440697669983, "reward_std": 0.11408869549632072, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7176690995693207, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 250.4375, "epoch": 0.75830078125, "grad_norm": 1.6918813158728625, "kl": 0.0628662109375, "learning_rate": 8.104248046875e-07, "loss": 0.0025, "reward": 1.7733458280563354, "reward_std": 0.1404724046587944, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7889708578586578, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 273.5546875, "epoch": 0.7587890625, "grad_norm": 1.7954408988834158, "kl": 0.059326171875, "learning_rate": 8.10302734375e-07, "loss": 0.0024, "reward": 1.7370250225067139, "reward_std": 0.059584882110357285, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7370250523090363, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 332.9375, "epoch": 0.75927734375, "grad_norm": 1.500988736398781, "kl": 0.047119140625, "learning_rate": 8.101806640625e-07, "loss": 0.0019, "reward": 1.8506624698638916, "reward_std": 0.09908335283398628, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8584749698638916, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 398.3359375, "epoch": 0.759765625, "grad_norm": 2.5780846344055774, "kl": 0.04541015625, "learning_rate": 8.100585937499999e-07, "loss": 0.0018, "reward": 1.6247803568840027, "reward_std": 0.14903922379016876, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6638428568840027, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 322.1484375, "epoch": 0.76025390625, "grad_norm": 1.6802574412935145, "kl": 0.0535888671875, "learning_rate": 8.099365234374999e-07, "loss": 0.0021, "reward": 1.7854554653167725, "reward_std": 0.0747103076428175, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7854554057121277, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 349.140625, "epoch": 0.7607421875, "grad_norm": 3.53135423693984, "kl": 0.0521240234375, "learning_rate": 8.098144531249999e-07, "loss": 0.0021, "reward": 1.5581438541412354, "reward_std": 0.08005472645163536, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6050188541412354, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 332.15625, "epoch": 0.76123046875, "grad_norm": 1.2263929278952201, "kl": 0.0550537109375, "learning_rate": 8.096923828125e-07, "loss": 0.0022, "reward": 1.7725134491920471, "reward_std": 0.09821948781609535, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8115759491920471, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 218.1484375, "epoch": 0.76171875, "grad_norm": 4.559002764522234, "kl": 0.068115234375, "learning_rate": 8.095703125e-07, "loss": 0.0027, "reward": 1.747908115386963, "reward_std": 0.017046626191586256, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7479080855846405, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 361.4765625, "epoch": 0.76220703125, "grad_norm": 2.6821850910454974, "kl": 0.0567626953125, "learning_rate": 8.094482421875e-07, "loss": 0.0023, "reward": 1.6794939041137695, "reward_std": 0.0593208484351635, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6794938743114471, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 283.1796875, "epoch": 0.7626953125, "grad_norm": 1.4073846788389592, "kl": 0.05224609375, "learning_rate": 8.09326171875e-07, "loss": 0.0021, "reward": 1.820260226726532, "reward_std": 0.04938836395740509, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8202601671218872, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 212.3515625, "epoch": 0.76318359375, "grad_norm": 1.0253536738713116, "kl": 0.068359375, "learning_rate": 8.092041015624999e-07, "loss": 0.0027, "reward": 1.6596548557281494, "reward_std": 0.02243457455188036, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6596548557281494, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 217.390625, "epoch": 0.763671875, "grad_norm": 0.802675306572281, "kl": 0.06494140625, "learning_rate": 8.090820312499999e-07, "loss": 0.0026, "reward": 1.7809888124465942, "reward_std": 0.038061970844864845, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7809888422489166, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 309.953125, "epoch": 0.76416015625, "grad_norm": 1.8997263090024157, "kl": 0.0560302734375, "learning_rate": 8.089599609375e-07, "loss": 0.0022, "reward": 1.653722107410431, "reward_std": 0.12127144634723663, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6615345478057861, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 313.703125, "epoch": 0.7646484375, "grad_norm": 1.958461815256613, "kl": 0.0506591796875, "learning_rate": 8.08837890625e-07, "loss": 0.002, "reward": 1.713492214679718, "reward_std": 0.1120409145951271, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7291173040866852, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 335.421875, "epoch": 0.76513671875, "grad_norm": 1.0413990963846795, "kl": 0.0498046875, "learning_rate": 8.087158203125e-07, "loss": 0.002, "reward": 1.7677712440490723, "reward_std": 0.09032433852553368, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7833963632583618, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 332.4765625, "epoch": 0.765625, "grad_norm": 2.975801159257183, "kl": 0.0538330078125, "learning_rate": 8.0859375e-07, "loss": 0.0022, "reward": 1.6949644684791565, "reward_std": 0.1156335175037384, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7105894386768341, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 268.09375, "epoch": 0.76611328125, "grad_norm": 4.913374457578766, "kl": 0.065185546875, "learning_rate": 8.084716796874999e-07, "loss": 0.0026, "reward": 1.8527822494506836, "reward_std": 0.03366856276988983, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.852782130241394, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 319.296875, "epoch": 0.7666015625, "grad_norm": 1.240155973042573, "kl": 0.0506591796875, "learning_rate": 8.083496093749999e-07, "loss": 0.002, "reward": 1.7566204071044922, "reward_std": 0.0804799273610115, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7644328773021698, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 284.1640625, "epoch": 0.76708984375, "grad_norm": 1.4265984505569123, "kl": 0.052001953125, "learning_rate": 8.082275390624999e-07, "loss": 0.0021, "reward": 1.7467219233512878, "reward_std": 0.017143062315881252, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7467218637466431, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 221.53125, "epoch": 0.767578125, "grad_norm": 1.5345523611187817, "kl": 0.060546875, "learning_rate": 8.0810546875e-07, "loss": 0.0024, "reward": 1.8325753211975098, "reward_std": 0.04120937455445528, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8325753211975098, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 402.21875, "epoch": 0.76806640625, "grad_norm": 1.4045255992327337, "kl": 0.0546875, "learning_rate": 8.079833984375e-07, "loss": 0.0022, "reward": 1.6535959243774414, "reward_std": 0.10518948920071125, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6770334243774414, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 477.7421875, "epoch": 0.7685546875, "grad_norm": 1.4480910955522743, "kl": 0.056396484375, "learning_rate": 8.07861328125e-07, "loss": 0.0023, "reward": 1.575450837612152, "reward_std": 0.0833788514137268, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6223257780075073, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 328.40625, "epoch": 0.76904296875, "grad_norm": 1.6388667341544991, "kl": 0.0498046875, "learning_rate": 8.077392578125e-07, "loss": 0.002, "reward": 1.776998221874237, "reward_std": 0.09045989066362381, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8004356920719147, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 335.3671875, "epoch": 0.76953125, "grad_norm": 4.580808288577953, "kl": 0.06787109375, "learning_rate": 8.076171874999999e-07, "loss": 0.0027, "reward": 1.7042137384414673, "reward_std": 0.15083208680152893, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7354637086391449, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 357.4765625, "epoch": 0.77001953125, "grad_norm": 2.019507688897045, "kl": 0.0489501953125, "learning_rate": 8.074951171874999e-07, "loss": 0.002, "reward": 1.6543389558792114, "reward_std": 0.1535024270415306, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7090264856815338, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 338.75, "epoch": 0.7705078125, "grad_norm": 1.5898119888098918, "kl": 0.0535888671875, "learning_rate": 8.07373046875e-07, "loss": 0.0021, "reward": 1.6351118683815002, "reward_std": 0.1109085101634264, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6585493683815002, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 308.078125, "epoch": 0.77099609375, "grad_norm": 0.8244536832784812, "kl": 0.0699462890625, "learning_rate": 8.072509765625e-07, "loss": 0.0028, "reward": 1.7688223123550415, "reward_std": 0.048665997572243214, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7688223123550415, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 362.9140625, "epoch": 0.771484375, "grad_norm": 7.833675662090654, "kl": 0.0458984375, "learning_rate": 8.0712890625e-07, "loss": 0.0018, "reward": 1.766732096672058, "reward_std": 0.037401504814624786, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7667320668697357, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 329.5703125, "epoch": 0.77197265625, "grad_norm": 1.272735275661589, "kl": 0.0555419921875, "learning_rate": 8.070068359375e-07, "loss": 0.0022, "reward": 1.8723936080932617, "reward_std": 0.06921904534101486, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8880185484886169, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 349.109375, "epoch": 0.7724609375, "grad_norm": 1.2808243375099266, "kl": 0.0645751953125, "learning_rate": 8.068847656249999e-07, "loss": 0.0026, "reward": 1.6261619925498962, "reward_std": 0.07076285779476166, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6261619329452515, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 354.578125, "epoch": 0.77294921875, "grad_norm": 2.3321912897118713, "kl": 0.078369140625, "learning_rate": 8.067626953124999e-07, "loss": 0.0031, "reward": 1.7845726013183594, "reward_std": 0.15707527101039886, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8001976609230042, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 374.8984375, "epoch": 0.7734375, "grad_norm": 1.0990319632980867, "kl": 0.0711669921875, "learning_rate": 8.066406249999999e-07, "loss": 0.0028, "reward": 1.7625375986099243, "reward_std": 0.05330556631088257, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7703501284122467, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 308.3359375, "epoch": 0.77392578125, "grad_norm": 2.118844572781893, "kl": 0.06396484375, "learning_rate": 8.065185546875e-07, "loss": 0.0026, "reward": 1.7572776079177856, "reward_std": 0.09189710766077042, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7572776079177856, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 288.7734375, "epoch": 0.7744140625, "grad_norm": 3.9816311052864553, "kl": 0.0728759765625, "learning_rate": 8.06396484375e-07, "loss": 0.0029, "reward": 1.7010331749916077, "reward_std": 0.1283012256026268, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7166581749916077, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 325.9140625, "epoch": 0.77490234375, "grad_norm": 1.3209118767681105, "kl": 0.050537109375, "learning_rate": 8.062744140625e-07, "loss": 0.002, "reward": 1.8046178817749023, "reward_std": 0.10538148693740368, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8124303817749023, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 359.1484375, "epoch": 0.775390625, "grad_norm": 2.5132872532943624, "kl": 0.046875, "learning_rate": 8.0615234375e-07, "loss": 0.0019, "reward": 1.7433744668960571, "reward_std": 0.09972074255347252, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.75118687748909, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 342.2578125, "epoch": 0.77587890625, "grad_norm": 3.40521583010745, "kl": 0.085693359375, "learning_rate": 8.060302734374999e-07, "loss": 0.0034, "reward": 1.628357172012329, "reward_std": 0.09522592648863792, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6361695826053619, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 373.2265625, "epoch": 0.7763671875, "grad_norm": 3.0455379805887093, "kl": 0.0606689453125, "learning_rate": 8.059082031249999e-07, "loss": 0.0024, "reward": 1.641897439956665, "reward_std": 0.08232817053794861, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6418974995613098, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 291.2734375, "epoch": 0.77685546875, "grad_norm": 2.2430367344910427, "kl": 0.0614013671875, "learning_rate": 8.057861328125e-07, "loss": 0.0025, "reward": 1.768738031387329, "reward_std": 0.07021267339587212, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7687380015850067, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 284.3828125, "epoch": 0.77734375, "grad_norm": 2.20938289635979, "kl": 0.069091796875, "learning_rate": 8.056640625e-07, "loss": 0.0028, "reward": 1.8064799904823303, "reward_std": 0.04143555276095867, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8064799904823303, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 314.671875, "epoch": 0.77783203125, "grad_norm": 2.7105879555153303, "kl": 0.060791015625, "learning_rate": 8.055419921875e-07, "loss": 0.0024, "reward": 1.6281877756118774, "reward_std": 0.1271475814282894, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6281877756118774, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 266.84375, "epoch": 0.7783203125, "grad_norm": 5.064693794868452, "kl": 0.0628662109375, "learning_rate": 8.05419921875e-07, "loss": 0.0025, "reward": 1.704953670501709, "reward_std": 0.09572456032037735, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.704953670501709, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 304.296875, "epoch": 0.77880859375, "grad_norm": 1.2322362890361453, "kl": 0.0523681640625, "learning_rate": 8.052978515624999e-07, "loss": 0.0021, "reward": 1.7966317534446716, "reward_std": 0.09780865162611008, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8044441640377045, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 300.6875, "epoch": 0.779296875, "grad_norm": 2.587066136267263, "kl": 0.0574951171875, "learning_rate": 8.051757812499999e-07, "loss": 0.0023, "reward": 1.748351275920868, "reward_std": 0.11127368733286858, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7483512759208679, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 224.96875, "epoch": 0.77978515625, "grad_norm": 8.678514629911282, "kl": 0.0655517578125, "learning_rate": 8.050537109374999e-07, "loss": 0.0026, "reward": 1.7842652797698975, "reward_std": 0.056360941380262375, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7842652797698975, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 285.6796875, "epoch": 0.7802734375, "grad_norm": 3.8186079651609486, "kl": 0.0589599609375, "learning_rate": 8.04931640625e-07, "loss": 0.0024, "reward": 1.8417965769767761, "reward_std": 0.07890859059989452, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8496091067790985, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 301.578125, "epoch": 0.78076171875, "grad_norm": 2.6414016724776146, "kl": 0.0634765625, "learning_rate": 8.048095703125e-07, "loss": 0.0025, "reward": 1.8113459348678589, "reward_std": 0.042547447606921196, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8113458752632141, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 310.65625, "epoch": 0.78125, "grad_norm": 2.7042761180533192, "kl": 0.0718994140625, "learning_rate": 8.046875e-07, "loss": 0.0029, "reward": 1.7982996702194214, "reward_std": 0.10386446584016085, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8061122000217438, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 248.8203125, "epoch": 0.78173828125, "grad_norm": 1.1262604316715779, "kl": 0.078125, "learning_rate": 8.045654296875e-07, "loss": 0.0031, "reward": 1.8404181599617004, "reward_std": 0.03405761159956455, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8404182195663452, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 263.75, "epoch": 0.7822265625, "grad_norm": 2.3428168026992586, "kl": 0.064697265625, "learning_rate": 8.044433593749999e-07, "loss": 0.0026, "reward": 1.770385503768921, "reward_std": 0.06261442601680756, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7703855335712433, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 240.7109375, "epoch": 0.78271484375, "grad_norm": 1.4573875580192406, "kl": 0.0693359375, "learning_rate": 8.043212890624999e-07, "loss": 0.0028, "reward": 1.8531925678253174, "reward_std": 0.04907483607530594, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8531925678253174, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 451.0859375, "epoch": 0.783203125, "grad_norm": 2.097256901008272, "kl": 0.04833984375, "learning_rate": 8.0419921875e-07, "loss": 0.0019, "reward": 1.6914434432983398, "reward_std": 0.12963934242725372, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7070684731006622, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 338.40625, "epoch": 0.78369140625, "grad_norm": 1.9320837029873765, "kl": 0.0601806640625, "learning_rate": 8.040771484375e-07, "loss": 0.0024, "reward": 1.680255651473999, "reward_std": 0.1190883181989193, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6880680620670319, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 273.9375, "epoch": 0.7841796875, "grad_norm": 1.0873386862814498, "kl": 0.0552978515625, "learning_rate": 8.03955078125e-07, "loss": 0.0022, "reward": 1.7301841378211975, "reward_std": 0.03086886089295149, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7301841080188751, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 254.640625, "epoch": 0.78466796875, "grad_norm": 1.305516177619871, "kl": 0.07568359375, "learning_rate": 8.038330078125e-07, "loss": 0.003, "reward": 1.651597023010254, "reward_std": 0.0745653323829174, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6515969634056091, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 238.890625, "epoch": 0.78515625, "grad_norm": 1.853930940155658, "kl": 0.0611572265625, "learning_rate": 8.037109375e-07, "loss": 0.0024, "reward": 1.7472956776618958, "reward_std": 0.036410000175237656, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.747295618057251, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 253.0390625, "epoch": 0.78564453125, "grad_norm": 1.5524349811107778, "kl": 0.0557861328125, "learning_rate": 8.035888671874999e-07, "loss": 0.0022, "reward": 1.757796585559845, "reward_std": 0.096245177090168, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.773421585559845, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 223.859375, "epoch": 0.7861328125, "grad_norm": 12.21460617267679, "kl": 0.0509033203125, "learning_rate": 8.034667968749999e-07, "loss": 0.002, "reward": 1.6556835770606995, "reward_std": 0.08177720569074154, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6556835770606995, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 334.9609375, "epoch": 0.78662109375, "grad_norm": 1.2988283875580688, "kl": 0.0513916015625, "learning_rate": 8.033447265625e-07, "loss": 0.0021, "reward": 1.7968943119049072, "reward_std": 0.03818834759294987, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7968942821025848, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 318.8359375, "epoch": 0.787109375, "grad_norm": 1.853996721692202, "kl": 0.0478515625, "learning_rate": 8.0322265625e-07, "loss": 0.0019, "reward": 1.6934837102890015, "reward_std": 0.06388338282704353, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6934837400913239, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 280.0859375, "epoch": 0.78759765625, "grad_norm": 0.9164065489554429, "kl": 0.0616455078125, "learning_rate": 8.031005859375e-07, "loss": 0.0025, "reward": 1.6735413074493408, "reward_std": 0.0621509775519371, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6735413372516632, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 327.4375, "epoch": 0.7880859375, "grad_norm": 2.82590067781753, "kl": 0.0567626953125, "learning_rate": 8.02978515625e-07, "loss": 0.0023, "reward": 1.6877517700195312, "reward_std": 0.0880160890519619, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6877517700195312, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 264.6640625, "epoch": 0.78857421875, "grad_norm": 20.08903138023436, "kl": 0.062744140625, "learning_rate": 8.028564453124999e-07, "loss": 0.0025, "reward": 1.762800931930542, "reward_std": 0.0888824425637722, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.762800931930542, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 276.515625, "epoch": 0.7890625, "grad_norm": 1.7330784289366532, "kl": 0.0504150390625, "learning_rate": 8.027343749999999e-07, "loss": 0.002, "reward": 1.8259278535842896, "reward_std": 0.08520985394716263, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8415527939796448, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 274.75, "epoch": 0.78955078125, "grad_norm": 2.062097230316505, "kl": 0.06298828125, "learning_rate": 8.026123046875e-07, "loss": 0.0025, "reward": 1.7127341032028198, "reward_std": 0.049558693543076515, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.712734043598175, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 273.4140625, "epoch": 0.7900390625, "grad_norm": 1.6783295426613631, "kl": 0.065185546875, "learning_rate": 8.02490234375e-07, "loss": 0.0026, "reward": 1.8034849166870117, "reward_std": 0.10387159883975983, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8034849166870117, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 230.15625, "epoch": 0.79052734375, "grad_norm": 1.6933921135873553, "kl": 0.0609130859375, "learning_rate": 8.023681640625e-07, "loss": 0.0024, "reward": 1.8547474145889282, "reward_std": 0.07110052555799484, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8547475039958954, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 287.0234375, "epoch": 0.791015625, "grad_norm": 2.7831972038485997, "kl": 0.070068359375, "learning_rate": 8.0224609375e-07, "loss": 0.0028, "reward": 1.6170747876167297, "reward_std": 0.07357279863208532, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6170747876167297, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 366.2109375, "epoch": 0.79150390625, "grad_norm": 1.7306710169661095, "kl": 0.060302734375, "learning_rate": 8.021240234375e-07, "loss": 0.0024, "reward": 1.7135973572731018, "reward_std": 0.04537785239517689, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7135973572731018, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 286.7265625, "epoch": 0.7919921875, "grad_norm": 2.1555906518927137, "kl": 0.0592041015625, "learning_rate": 8.020019531249999e-07, "loss": 0.0024, "reward": 1.6998938918113708, "reward_std": 0.07550182193517685, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6998938918113708, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 284.84375, "epoch": 0.79248046875, "grad_norm": 1.4507712079046071, "kl": 0.0517578125, "learning_rate": 8.018798828124999e-07, "loss": 0.0021, "reward": 1.7838214635849, "reward_std": 0.08774328604340553, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7838214635848999, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 345.6171875, "epoch": 0.79296875, "grad_norm": 1.0757521287144136, "kl": 0.041259765625, "learning_rate": 8.017578125e-07, "loss": 0.0017, "reward": 1.8050659894943237, "reward_std": 0.036978503689169884, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.805065929889679, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 334.75, "epoch": 0.79345703125, "grad_norm": 7.947108074112008, "kl": 0.051513671875, "learning_rate": 8.016357421875e-07, "loss": 0.0021, "reward": 1.6861704587936401, "reward_std": 0.038627080619335175, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6861703991889954, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 318.2734375, "epoch": 0.7939453125, "grad_norm": 1.9454989447472923, "kl": 0.06591796875, "learning_rate": 8.01513671875e-07, "loss": 0.0026, "reward": 1.6758694648742676, "reward_std": 0.05895833298563957, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6758694648742676, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 281.75, "epoch": 0.79443359375, "grad_norm": 0.7165968159288184, "kl": 0.0621337890625, "learning_rate": 8.013916015625e-07, "loss": 0.0025, "reward": 1.6823166608810425, "reward_std": 0.02876619715243578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6823166906833649, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 340.921875, "epoch": 0.794921875, "grad_norm": 1.5106278082685252, "kl": 0.0543212890625, "learning_rate": 8.012695312499999e-07, "loss": 0.0022, "reward": 1.6847730875015259, "reward_std": 0.13465760834515095, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7082105278968811, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 299.421875, "epoch": 0.79541015625, "grad_norm": 3.32714688701594, "kl": 0.063720703125, "learning_rate": 8.011474609374999e-07, "loss": 0.0025, "reward": 1.7635406851768494, "reward_std": 0.10127770528197289, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7635407149791718, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 287.3984375, "epoch": 0.7958984375, "grad_norm": 1.051302145633701, "kl": 0.070556640625, "learning_rate": 8.01025390625e-07, "loss": 0.0028, "reward": 1.6394500732421875, "reward_std": 0.11420125816948712, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6707001626491547, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 323.1640625, "epoch": 0.79638671875, "grad_norm": 4.617980559742919, "kl": 0.056396484375, "learning_rate": 8.009033203125e-07, "loss": 0.0023, "reward": 1.6397234201431274, "reward_std": 0.1288561257533729, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6553484499454498, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 288.515625, "epoch": 0.796875, "grad_norm": 3.2567614792509887, "kl": 0.065185546875, "learning_rate": 8.0078125e-07, "loss": 0.0026, "reward": 1.552538812160492, "reward_std": 0.054762667044997215, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5525388121604919, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 271.671875, "epoch": 0.79736328125, "grad_norm": 1.051014835566516, "kl": 0.0572509765625, "learning_rate": 8.006591796875e-07, "loss": 0.0023, "reward": 1.8010156750679016, "reward_std": 0.047447606921195984, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8088282346725464, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 291.84375, "epoch": 0.7978515625, "grad_norm": 2.2192496100152415, "kl": 0.0694580078125, "learning_rate": 8.00537109375e-07, "loss": 0.0028, "reward": 1.7100372314453125, "reward_std": 0.07988406717777252, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7100372314453125, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 344.8828125, "epoch": 0.79833984375, "grad_norm": 6.815933212112274, "kl": 0.0489501953125, "learning_rate": 8.004150390624999e-07, "loss": 0.002, "reward": 1.5929869413375854, "reward_std": 0.12428093701601028, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6476744413375854, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 338.5625, "epoch": 0.798828125, "grad_norm": 4.025394315862172, "kl": 0.057373046875, "learning_rate": 8.002929687499999e-07, "loss": 0.0023, "reward": 1.6940549612045288, "reward_std": 0.12819510325789452, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7174924910068512, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 200.109375, "epoch": 0.79931640625, "grad_norm": 58.06981888749972, "kl": 0.065673828125, "learning_rate": 8.001708984375e-07, "loss": 0.0026, "reward": 1.725978434085846, "reward_std": 0.02466776454821229, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.725978434085846, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 265.09375, "epoch": 0.7998046875, "grad_norm": 7.519107809173853, "kl": 0.0826416015625, "learning_rate": 8.00048828125e-07, "loss": 0.0033, "reward": 1.7485601305961609, "reward_std": 0.04266110900789499, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7485601305961609, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 319.234375, "epoch": 0.80029296875, "grad_norm": 3.1218867448796446, "kl": 0.0673828125, "learning_rate": 7.999267578125e-07, "loss": 0.0027, "reward": 1.772888958454132, "reward_std": 0.04786605387926102, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7728888988494873, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 318.421875, "epoch": 0.80078125, "grad_norm": 3.916540661111048, "kl": 0.064697265625, "learning_rate": 7.998046875e-07, "loss": 0.0026, "reward": 1.6346943378448486, "reward_std": 0.15634194761514664, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6581318378448486, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 302.046875, "epoch": 0.80126953125, "grad_norm": 2.7275559731450985, "kl": 0.068359375, "learning_rate": 7.996826171874999e-07, "loss": 0.0027, "reward": 1.7609490156173706, "reward_std": 0.09491265751421452, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7687614560127258, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 375.953125, "epoch": 0.8017578125, "grad_norm": 1.3559434809235391, "kl": 0.05126953125, "learning_rate": 7.995605468749999e-07, "loss": 0.002, "reward": 1.7670413851737976, "reward_std": 0.046386873349547386, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7670413553714752, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 307.6484375, "epoch": 0.80224609375, "grad_norm": 0.9556319282815477, "kl": 0.060302734375, "learning_rate": 7.994384765625e-07, "loss": 0.0024, "reward": 1.7157460451126099, "reward_std": 0.11387444660067558, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7548085451126099, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 327.09375, "epoch": 0.802734375, "grad_norm": 1.7945424457176717, "kl": 0.0653076171875, "learning_rate": 7.9931640625e-07, "loss": 0.0026, "reward": 1.6921892762184143, "reward_std": 0.18082339316606522, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7156267762184143, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 227.6015625, "epoch": 0.80322265625, "grad_norm": 1.75248051233542, "kl": 0.0631103515625, "learning_rate": 7.991943359375e-07, "loss": 0.0025, "reward": 1.728013813495636, "reward_std": 0.12820342928171158, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.751451313495636, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 335.4140625, "epoch": 0.8037109375, "grad_norm": 1.5536038507658307, "kl": 0.054443359375, "learning_rate": 7.99072265625e-07, "loss": 0.0022, "reward": 1.7276506423950195, "reward_std": 0.08504182286560535, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7354631721973419, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 345.8671875, "epoch": 0.80419921875, "grad_norm": 1.3916694913938594, "kl": 0.056884765625, "learning_rate": 7.989501953125e-07, "loss": 0.0023, "reward": 1.7095491290092468, "reward_std": 0.12051964923739433, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7329866290092468, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 213.9375, "epoch": 0.8046875, "grad_norm": 3.2993635236882883, "kl": 0.0543212890625, "learning_rate": 7.988281249999999e-07, "loss": 0.0022, "reward": 1.8719586730003357, "reward_std": 0.06403150595724583, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8797712028026581, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 251.46875, "epoch": 0.80517578125, "grad_norm": 1.5159143122250482, "kl": 0.0556640625, "learning_rate": 7.987060546874999e-07, "loss": 0.0022, "reward": 1.6899768710136414, "reward_std": 0.026911514345556498, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6899769306182861, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 275.15625, "epoch": 0.8056640625, "grad_norm": 1.7764803050121032, "kl": 0.0616455078125, "learning_rate": 7.98583984375e-07, "loss": 0.0025, "reward": 1.7939913868904114, "reward_std": 0.08574027381837368, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8018038868904114, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 352.875, "epoch": 0.80615234375, "grad_norm": 1.7844460075537483, "kl": 0.0601806640625, "learning_rate": 7.984619140625e-07, "loss": 0.0024, "reward": 1.7695591449737549, "reward_std": 0.07760765310376883, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7773716747760773, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 338.5078125, "epoch": 0.806640625, "grad_norm": 0.8825730937574552, "kl": 0.04833984375, "learning_rate": 7.9833984375e-07, "loss": 0.0019, "reward": 1.680052638053894, "reward_std": 0.056173376739025116, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.680052638053894, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 268.84375, "epoch": 0.80712890625, "grad_norm": 1.582841548895619, "kl": 0.0606689453125, "learning_rate": 7.982177734375e-07, "loss": 0.0024, "reward": 1.8114255666732788, "reward_std": 0.0704609714448452, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8114255368709564, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 286.46875, "epoch": 0.8076171875, "grad_norm": 3.246295454745955, "kl": 0.0618896484375, "learning_rate": 7.980957031249999e-07, "loss": 0.0025, "reward": 1.7257680296897888, "reward_std": 0.097720542922616, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7413930892944336, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 210.828125, "epoch": 0.80810546875, "grad_norm": 8.02326668523963, "kl": 0.06982421875, "learning_rate": 7.979736328124999e-07, "loss": 0.0028, "reward": 1.832155466079712, "reward_std": 0.06834479048848152, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8321554660797119, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 299.4609375, "epoch": 0.80859375, "grad_norm": 1.8274593522243785, "kl": 0.0556640625, "learning_rate": 7.978515624999999e-07, "loss": 0.0022, "reward": 1.7873517274856567, "reward_std": 0.030621130019426346, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7873516976833344, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 264.140625, "epoch": 0.80908203125, "grad_norm": 1.4152329913916, "kl": 0.0572509765625, "learning_rate": 7.977294921875e-07, "loss": 0.0023, "reward": 1.7179552912712097, "reward_std": 0.09283644892275333, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7179553210735321, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 292.5078125, "epoch": 0.8095703125, "grad_norm": 1.913575589493602, "kl": 0.07080078125, "learning_rate": 7.97607421875e-07, "loss": 0.0028, "reward": 1.7112517356872559, "reward_std": 0.08975563384592533, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7190642654895782, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 310.5703125, "epoch": 0.81005859375, "grad_norm": 2.458378061669866, "kl": 0.0548095703125, "learning_rate": 7.974853515625e-07, "loss": 0.0022, "reward": 1.7066927552223206, "reward_std": 0.0858432799577713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7066927552223206, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 337.359375, "epoch": 0.810546875, "grad_norm": 17.6320865532267, "kl": 0.0396728515625, "learning_rate": 7.9736328125e-07, "loss": 0.0016, "reward": 1.7677738666534424, "reward_std": 0.04016917198896408, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7677737772464752, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 324.046875, "epoch": 0.81103515625, "grad_norm": 2.7897545470605953, "kl": 0.0859375, "learning_rate": 7.972412109374999e-07, "loss": 0.0034, "reward": 1.7481929063796997, "reward_std": 0.1298337448388338, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7560054063796997, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 293.28125, "epoch": 0.8115234375, "grad_norm": 1.3955624722306827, "kl": 0.0496826171875, "learning_rate": 7.971191406249999e-07, "loss": 0.002, "reward": 1.782720685005188, "reward_std": 0.04922756180167198, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.782720685005188, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 250.265625, "epoch": 0.81201171875, "grad_norm": 2.3186462742034055, "kl": 0.076171875, "learning_rate": 7.969970703125e-07, "loss": 0.003, "reward": 1.715933918952942, "reward_std": 0.09554797038435936, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7159339785575867, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 331.203125, "epoch": 0.8125, "grad_norm": 1.3672621736408754, "kl": 0.0516357421875, "learning_rate": 7.96875e-07, "loss": 0.0021, "reward": 1.6866209506988525, "reward_std": 0.0732644684612751, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7022460103034973, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 305.0390625, "epoch": 0.81298828125, "grad_norm": 2.4881291764185476, "kl": 0.053466796875, "learning_rate": 7.967529296875e-07, "loss": 0.0021, "reward": 1.7344902157783508, "reward_std": 0.056121040135622025, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7344902157783508, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 234.109375, "epoch": 0.8134765625, "grad_norm": 1.9066576061889415, "kl": 0.056396484375, "learning_rate": 7.96630859375e-07, "loss": 0.0023, "reward": 1.777056872844696, "reward_std": 0.10177679359912872, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7770569026470184, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 295.15625, "epoch": 0.81396484375, "grad_norm": 5.827878805206846, "kl": 0.05078125, "learning_rate": 7.965087890624999e-07, "loss": 0.002, "reward": 1.8067973852157593, "reward_std": 0.0667799562215805, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8067973554134369, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 365.53125, "epoch": 0.814453125, "grad_norm": 1.8284939496819252, "kl": 0.0498046875, "learning_rate": 7.963867187499999e-07, "loss": 0.002, "reward": 1.7949933409690857, "reward_std": 0.12090729176998138, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8028059005737305, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 337.7578125, "epoch": 0.81494140625, "grad_norm": 1.7262715269151432, "kl": 0.060791015625, "learning_rate": 7.962646484374999e-07, "loss": 0.0024, "reward": 1.7913283109664917, "reward_std": 0.09738441929221153, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7991408109664917, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 323.7421875, "epoch": 0.8154296875, "grad_norm": 3.374343895722311, "kl": 0.04833984375, "learning_rate": 7.96142578125e-07, "loss": 0.0019, "reward": 1.7700502276420593, "reward_std": 0.11497660167515278, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7934877574443817, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 353.046875, "epoch": 0.81591796875, "grad_norm": 8.785283536535555, "kl": 0.053466796875, "learning_rate": 7.960205078125e-07, "loss": 0.0021, "reward": 1.803626537322998, "reward_std": 0.04715009219944477, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8036265671253204, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 363.1796875, "epoch": 0.81640625, "grad_norm": 3.7883503464800756, "kl": 0.05322265625, "learning_rate": 7.958984375e-07, "loss": 0.0021, "reward": 1.789641559123993, "reward_std": 0.07576981373131275, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7974540293216705, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 293.4296875, "epoch": 0.81689453125, "grad_norm": 2.623805320468466, "kl": 0.05810546875, "learning_rate": 7.957763671875e-07, "loss": 0.0023, "reward": 1.6763262748718262, "reward_std": 0.07890587951987982, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6763262748718262, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 342.6484375, "epoch": 0.8173828125, "grad_norm": 1.265922144311732, "kl": 0.0582275390625, "learning_rate": 7.956542968749999e-07, "loss": 0.0023, "reward": 1.8311368823051453, "reward_std": 0.05044192261993885, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.83113694190979, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 269.3828125, "epoch": 0.81787109375, "grad_norm": 1.1222356683541728, "kl": 0.0572509765625, "learning_rate": 7.955322265624999e-07, "loss": 0.0023, "reward": 1.777391493320465, "reward_std": 0.028398778289556503, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7773914933204651, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 195.671875, "epoch": 0.818359375, "grad_norm": 1.3969206681569204, "kl": 0.0718994140625, "learning_rate": 7.9541015625e-07, "loss": 0.0029, "reward": 1.7802749872207642, "reward_std": 0.03433122206479311, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7802750468254089, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 318.2421875, "epoch": 0.81884765625, "grad_norm": 1.7764370939806322, "kl": 0.0650634765625, "learning_rate": 7.952880859375e-07, "loss": 0.0026, "reward": 1.754819393157959, "reward_std": 0.05870789662003517, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7548194527626038, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 339.0078125, "epoch": 0.8193359375, "grad_norm": 3.8365527733566775, "kl": 0.0601806640625, "learning_rate": 7.95166015625e-07, "loss": 0.0024, "reward": 1.7233573198318481, "reward_std": 0.07569370232522488, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7233573496341705, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 338.8203125, "epoch": 0.81982421875, "grad_norm": 0.9365014960349338, "kl": 0.0455322265625, "learning_rate": 7.950439453125e-07, "loss": 0.0018, "reward": 1.6805170774459839, "reward_std": 0.10303526744246483, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6961420774459839, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 286.375, "epoch": 0.8203125, "grad_norm": 2.7631417410653833, "kl": 0.0693359375, "learning_rate": 7.949218749999999e-07, "loss": 0.0028, "reward": 1.8013280034065247, "reward_std": 0.04317835159599781, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8013280034065247, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 361.4921875, "epoch": 0.82080078125, "grad_norm": 1.7477349200279404, "kl": 0.05322265625, "learning_rate": 7.947998046874999e-07, "loss": 0.0021, "reward": 1.7636698484420776, "reward_std": 0.08129507303237915, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7636699080467224, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 304.2890625, "epoch": 0.8212890625, "grad_norm": 42.813192741926215, "kl": 0.0697021484375, "learning_rate": 7.946777343749999e-07, "loss": 0.0028, "reward": 1.707559585571289, "reward_std": 0.04496626928448677, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7075595855712891, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 306.2734375, "epoch": 0.82177734375, "grad_norm": 0.9238209123448747, "kl": 0.061279296875, "learning_rate": 7.945556640625e-07, "loss": 0.0024, "reward": 1.7082937955856323, "reward_std": 0.08139174059033394, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7161062955856323, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 284.890625, "epoch": 0.822265625, "grad_norm": 1.8243450346823835, "kl": 0.0543212890625, "learning_rate": 7.9443359375e-07, "loss": 0.0022, "reward": 1.7618046402931213, "reward_std": 0.08494714740663767, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7696171402931213, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 331.28125, "epoch": 0.82275390625, "grad_norm": 0.9658134449963954, "kl": 0.0517578125, "learning_rate": 7.943115234375e-07, "loss": 0.0021, "reward": 1.8145250082015991, "reward_std": 0.0333581417798996, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8145250082015991, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 333.1171875, "epoch": 0.8232421875, "grad_norm": 3.3053961272613166, "kl": 0.0567626953125, "learning_rate": 7.94189453125e-07, "loss": 0.0023, "reward": 1.7290484309196472, "reward_std": 0.07806419394910336, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.736860990524292, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 253.3828125, "epoch": 0.82373046875, "grad_norm": 2.2314315017058552, "kl": 0.0634765625, "learning_rate": 7.940673828124999e-07, "loss": 0.0025, "reward": 1.694116473197937, "reward_std": 0.09638424962759018, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6941164433956146, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 250.6484375, "epoch": 0.82421875, "grad_norm": 28.71281793769175, "kl": 0.072509765625, "learning_rate": 7.939453124999999e-07, "loss": 0.0029, "reward": 1.6306800842285156, "reward_std": 0.06786506250500679, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6306800842285156, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 308.7734375, "epoch": 0.82470703125, "grad_norm": 1.81316275312339, "kl": 0.062255859375, "learning_rate": 7.938232421875e-07, "loss": 0.0025, "reward": 1.754611313343048, "reward_std": 0.04153428506106138, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7546113431453705, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 307.328125, "epoch": 0.8251953125, "grad_norm": 1.230137752824096, "kl": 0.0703125, "learning_rate": 7.93701171875e-07, "loss": 0.0028, "reward": 1.7025874853134155, "reward_std": 0.08435030654072762, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7025875449180603, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 253.2109375, "epoch": 0.82568359375, "grad_norm": 3.3713817064505265, "kl": 0.072509765625, "learning_rate": 7.935791015625e-07, "loss": 0.0029, "reward": 1.6434346437454224, "reward_std": 0.04744470492005348, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6434346735477448, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 261.234375, "epoch": 0.826171875, "grad_norm": 1.7471049654231616, "kl": 0.052978515625, "learning_rate": 7.9345703125e-07, "loss": 0.0021, "reward": 1.7282820343971252, "reward_std": 0.08323949202895164, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.73609459400177, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 304.8046875, "epoch": 0.82666015625, "grad_norm": 1.8268182609321926, "kl": 0.0521240234375, "learning_rate": 7.933349609375e-07, "loss": 0.0021, "reward": 1.845442295074463, "reward_std": 0.08247396722435951, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8454422950744629, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 272.28125, "epoch": 0.8271484375, "grad_norm": 2.2899500423574617, "kl": 0.0655517578125, "learning_rate": 7.932128906249999e-07, "loss": 0.0026, "reward": 1.7103111743927002, "reward_std": 0.018862903118133545, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7103111147880554, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 280.203125, "epoch": 0.82763671875, "grad_norm": 0.824676866426935, "kl": 0.0635986328125, "learning_rate": 7.930908203124999e-07, "loss": 0.0025, "reward": 1.7673900723457336, "reward_std": 0.05198059044778347, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7673900127410889, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 313.6796875, "epoch": 0.828125, "grad_norm": 1.5355026194431203, "kl": 0.066650390625, "learning_rate": 7.9296875e-07, "loss": 0.0027, "reward": 1.683157503604889, "reward_std": 0.14624864608049393, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6987824440002441, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 294.890625, "epoch": 0.82861328125, "grad_norm": 2.0932753654505145, "kl": 0.0615234375, "learning_rate": 7.928466796875e-07, "loss": 0.0025, "reward": 1.7546579837799072, "reward_std": 0.048308661207556725, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7546580135822296, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 300.875, "epoch": 0.8291015625, "grad_norm": 1.4463454973748548, "kl": 0.0565185546875, "learning_rate": 7.92724609375e-07, "loss": 0.0023, "reward": 1.8585106134414673, "reward_std": 0.07304185070097446, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8585106730461121, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 285.109375, "epoch": 0.82958984375, "grad_norm": 2.8574585912703454, "kl": 0.052734375, "learning_rate": 7.926025390625e-07, "loss": 0.0021, "reward": 1.8252478241920471, "reward_std": 0.06224694475531578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8252477645874023, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 309.15625, "epoch": 0.830078125, "grad_norm": 3.057914572093619, "kl": 0.047119140625, "learning_rate": 7.924804687499999e-07, "loss": 0.0019, "reward": 1.7930091619491577, "reward_std": 0.0563307236880064, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7930091023445129, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 249.3984375, "epoch": 0.83056640625, "grad_norm": 4.684114511035403, "kl": 0.078125, "learning_rate": 7.923583984374999e-07, "loss": 0.0031, "reward": 1.716322124004364, "reward_std": 0.10844194889068604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7163220942020416, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 246.4140625, "epoch": 0.8310546875, "grad_norm": 0.8021038944890889, "kl": 0.053466796875, "learning_rate": 7.92236328125e-07, "loss": 0.0021, "reward": 1.882490634918213, "reward_std": 0.033562688156962395, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8824906647205353, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 321.125, "epoch": 0.83154296875, "grad_norm": 1.6259472943086517, "kl": 0.0560302734375, "learning_rate": 7.921142578125e-07, "loss": 0.0022, "reward": 1.7306804060935974, "reward_std": 0.06768567860126495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.730680376291275, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 302.0625, "epoch": 0.83203125, "grad_norm": 1.5177838768420855, "kl": 0.0523681640625, "learning_rate": 7.919921875e-07, "loss": 0.0021, "reward": 1.7253316640853882, "reward_std": 0.08401273377239704, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7331441044807434, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 261.3203125, "epoch": 0.83251953125, "grad_norm": 3.735972907900447, "kl": 0.0751953125, "learning_rate": 7.918701171875e-07, "loss": 0.003, "reward": 1.7001928091049194, "reward_std": 0.12967666238546371, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7001928091049194, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 257.890625, "epoch": 0.8330078125, "grad_norm": 1.4428376073481957, "kl": 0.072021484375, "learning_rate": 7.91748046875e-07, "loss": 0.0029, "reward": 1.723404347896576, "reward_std": 0.055715300142765045, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7234043180942535, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 259.4765625, "epoch": 0.83349609375, "grad_norm": 1.345764950057352, "kl": 0.0791015625, "learning_rate": 7.916259765624999e-07, "loss": 0.0032, "reward": 1.6827195286750793, "reward_std": 0.04551626928150654, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6827195584774017, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 360.6796875, "epoch": 0.833984375, "grad_norm": 1.3356449765278338, "kl": 0.048828125, "learning_rate": 7.915039062499999e-07, "loss": 0.002, "reward": 1.700093388557434, "reward_std": 0.07758311927318573, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7000934183597565, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 306.9765625, "epoch": 0.83447265625, "grad_norm": 4.208157265473066, "kl": 0.0489501953125, "learning_rate": 7.913818359375e-07, "loss": 0.002, "reward": 1.6747546792030334, "reward_std": 0.05638587847352028, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6747547090053558, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 280.0859375, "epoch": 0.8349609375, "grad_norm": 1.1933173641003354, "kl": 0.057861328125, "learning_rate": 7.91259765625e-07, "loss": 0.0023, "reward": 1.814025104045868, "reward_std": 0.031041912734508514, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8140251636505127, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 336.109375, "epoch": 0.83544921875, "grad_norm": 1.533800629531482, "kl": 0.0560302734375, "learning_rate": 7.911376953125e-07, "loss": 0.0022, "reward": 1.785912573337555, "reward_std": 0.04343899525702, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7859126031398773, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 302.8203125, "epoch": 0.8359375, "grad_norm": 1.5278482313965254, "kl": 0.05859375, "learning_rate": 7.91015625e-07, "loss": 0.0023, "reward": 1.6969901323318481, "reward_std": 0.05987878702580929, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6969901621341705, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 305.40625, "epoch": 0.83642578125, "grad_norm": 2.174348828215157, "kl": 0.0538330078125, "learning_rate": 7.908935546874999e-07, "loss": 0.0022, "reward": 1.6450940370559692, "reward_std": 0.13033273071050644, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6529065370559692, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 339.21875, "epoch": 0.8369140625, "grad_norm": 2.4062321676761877, "kl": 0.0526123046875, "learning_rate": 7.907714843749999e-07, "loss": 0.0021, "reward": 1.8309618830680847, "reward_std": 0.04240616038441658, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8309618830680847, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 280.1171875, "epoch": 0.83740234375, "grad_norm": 2.977096488750238, "kl": 0.05517578125, "learning_rate": 7.906494140625e-07, "loss": 0.0022, "reward": 1.7772547602653503, "reward_std": 0.07366564497351646, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.777254730463028, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 328.8203125, "epoch": 0.837890625, "grad_norm": 1.5557139163461535, "kl": 0.0499267578125, "learning_rate": 7.9052734375e-07, "loss": 0.002, "reward": 1.7700649499893188, "reward_std": 0.04422549903392792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7700649499893188, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 282.1796875, "epoch": 0.83837890625, "grad_norm": 2.338925760005317, "kl": 0.0511474609375, "learning_rate": 7.904052734375e-07, "loss": 0.002, "reward": 1.6442299485206604, "reward_std": 0.09395516850054264, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6520424783229828, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 186.1328125, "epoch": 0.8388671875, "grad_norm": 1.824044711639337, "kl": 0.0684814453125, "learning_rate": 7.90283203125e-07, "loss": 0.0027, "reward": 1.5608445405960083, "reward_std": 0.09194111078977585, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5608445107936859, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 256.234375, "epoch": 0.83935546875, "grad_norm": 4.462873508127259, "kl": 0.05078125, "learning_rate": 7.901611328125e-07, "loss": 0.002, "reward": 1.8095470070838928, "reward_std": 0.05928418226540089, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8095470666885376, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 278.0859375, "epoch": 0.83984375, "grad_norm": 1.7361959977725214, "kl": 0.05419921875, "learning_rate": 7.900390624999999e-07, "loss": 0.0022, "reward": 1.811360478401184, "reward_std": 0.03823063708841801, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8113605082035065, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 277.4140625, "epoch": 0.84033203125, "grad_norm": 2.1880703343070445, "kl": 0.0506591796875, "learning_rate": 7.899169921874999e-07, "loss": 0.002, "reward": 1.8319576978683472, "reward_std": 0.08859403431415558, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8397701978683472, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 341.0, "epoch": 0.8408203125, "grad_norm": 1.090543206713142, "kl": 0.0567626953125, "learning_rate": 7.89794921875e-07, "loss": 0.0023, "reward": 1.854802429676056, "reward_std": 0.045012121088802814, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8548024296760559, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 371.2265625, "epoch": 0.84130859375, "grad_norm": 2.292205761747624, "kl": 0.0625, "learning_rate": 7.896728515625e-07, "loss": 0.0025, "reward": 1.6879829168319702, "reward_std": 0.08614437095820904, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.687982976436615, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 294.21875, "epoch": 0.841796875, "grad_norm": 2.59542146501557, "kl": 0.058349609375, "learning_rate": 7.8955078125e-07, "loss": 0.0023, "reward": 1.6968461871147156, "reward_std": 0.041891030967235565, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6968461871147156, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 233.75, "epoch": 0.84228515625, "grad_norm": 1.920538563839018, "kl": 0.0645751953125, "learning_rate": 7.894287109375e-07, "loss": 0.0026, "reward": 1.6629520654678345, "reward_std": 0.02853654231876135, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6629520356655121, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 307.484375, "epoch": 0.8427734375, "grad_norm": 1.6048607833077935, "kl": 0.047119140625, "learning_rate": 7.893066406249999e-07, "loss": 0.0019, "reward": 1.7042565941810608, "reward_std": 0.07790570706129074, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7198816537857056, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 236.890625, "epoch": 0.84326171875, "grad_norm": 1.3785295989211974, "kl": 0.06787109375, "learning_rate": 7.891845703124999e-07, "loss": 0.0027, "reward": 1.6507259607315063, "reward_std": 0.03808063454926014, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6507259607315063, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 267.890625, "epoch": 0.84375, "grad_norm": 12.126077178793409, "kl": 0.0609130859375, "learning_rate": 7.890625e-07, "loss": 0.0024, "reward": 1.749779462814331, "reward_std": 0.04450598731637001, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.749779462814331, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 288.5859375, "epoch": 0.84423828125, "grad_norm": 1.3126743985604528, "kl": 0.0645751953125, "learning_rate": 7.889404296875e-07, "loss": 0.0026, "reward": 1.726485550403595, "reward_std": 0.04436471126973629, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726485550403595, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 305.6953125, "epoch": 0.8447265625, "grad_norm": 7.765244544687918, "kl": 0.0594482421875, "learning_rate": 7.88818359375e-07, "loss": 0.0024, "reward": 1.559519112110138, "reward_std": 0.044531380757689476, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5595191121101379, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 319.71875, "epoch": 0.84521484375, "grad_norm": 1.507983096496928, "kl": 0.0557861328125, "learning_rate": 7.886962890625e-07, "loss": 0.0022, "reward": 1.634689450263977, "reward_std": 0.1870577111840248, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.689376950263977, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 308.0234375, "epoch": 0.845703125, "grad_norm": 0.6714636709114369, "kl": 0.05126953125, "learning_rate": 7.8857421875e-07, "loss": 0.002, "reward": 1.9045502543449402, "reward_std": 0.06615402922034264, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.912362813949585, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 225.84375, "epoch": 0.84619140625, "grad_norm": 2.1809232819190214, "kl": 0.064697265625, "learning_rate": 7.884521484374999e-07, "loss": 0.0026, "reward": 1.6859930753707886, "reward_std": 0.07783360034227371, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6859930753707886, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 247.8515625, "epoch": 0.8466796875, "grad_norm": 1.498211478873343, "kl": 0.068603515625, "learning_rate": 7.883300781249999e-07, "loss": 0.0027, "reward": 1.773053526878357, "reward_std": 0.050117356702685356, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7730535268783569, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 258.3359375, "epoch": 0.84716796875, "grad_norm": 3.1690030279370487, "kl": 0.0716552734375, "learning_rate": 7.882080078125e-07, "loss": 0.0029, "reward": 1.632994532585144, "reward_std": 0.06776593998074532, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6329945027828217, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 399.078125, "epoch": 0.84765625, "grad_norm": 3.785740665297623, "kl": 0.0543212890625, "learning_rate": 7.880859375e-07, "loss": 0.0022, "reward": 1.757651686668396, "reward_std": 0.07453594170510769, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7654642462730408, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 299.1796875, "epoch": 0.84814453125, "grad_norm": 0.7651577403292609, "kl": 0.0570068359375, "learning_rate": 7.879638671875e-07, "loss": 0.0023, "reward": 1.7877587676048279, "reward_std": 0.055866248439997435, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7955712676048279, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 398.3203125, "epoch": 0.8486328125, "grad_norm": 0.9326597932338185, "kl": 0.04638671875, "learning_rate": 7.87841796875e-07, "loss": 0.0019, "reward": 1.7322826385498047, "reward_std": 0.06718971207737923, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7322825789451599, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 339.9296875, "epoch": 0.84912109375, "grad_norm": 1.093915850549881, "kl": 0.051025390625, "learning_rate": 7.877197265624999e-07, "loss": 0.002, "reward": 1.7318394184112549, "reward_std": 0.057227155193686485, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7318393290042877, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 312.0625, "epoch": 0.849609375, "grad_norm": 4.525173371174026, "kl": 0.077880859375, "learning_rate": 7.875976562499999e-07, "loss": 0.0031, "reward": 1.5092061758041382, "reward_std": 0.12106321007013321, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5248312056064606, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 280.8046875, "epoch": 0.85009765625, "grad_norm": 2.8086114016486823, "kl": 0.071533203125, "learning_rate": 7.874755859375e-07, "loss": 0.0029, "reward": 1.820200264453888, "reward_std": 0.0870150737464428, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8202002048492432, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 236.40625, "epoch": 0.8505859375, "grad_norm": 3.4822870448370913, "kl": 0.047607421875, "learning_rate": 7.87353515625e-07, "loss": 0.0019, "reward": 1.8253133296966553, "reward_std": 0.07836700230836868, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8253132998943329, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 335.7578125, "epoch": 0.85107421875, "grad_norm": 2.114293687003822, "kl": 0.04248046875, "learning_rate": 7.872314453125e-07, "loss": 0.0017, "reward": 1.8304061889648438, "reward_std": 0.12630556523799896, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8460312485694885, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 293.8671875, "epoch": 0.8515625, "grad_norm": 2.6990671836659295, "kl": 0.05908203125, "learning_rate": 7.87109375e-07, "loss": 0.0024, "reward": 1.7995912432670593, "reward_std": 0.04738871939480305, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7995912134647369, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 293.3359375, "epoch": 0.85205078125, "grad_norm": 1.14016156478151, "kl": 0.06689453125, "learning_rate": 7.869873046875e-07, "loss": 0.0027, "reward": 1.6955284476280212, "reward_std": 0.13801120221614838, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7345908880233765, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 396.328125, "epoch": 0.8525390625, "grad_norm": 1.7919196097725265, "kl": 0.0452880859375, "learning_rate": 7.868652343749999e-07, "loss": 0.0018, "reward": 1.7723374962806702, "reward_std": 0.09062624350190163, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7801499664783478, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 263.28125, "epoch": 0.85302734375, "grad_norm": 2.2144660721415805, "kl": 0.0555419921875, "learning_rate": 7.867431640624999e-07, "loss": 0.0022, "reward": 1.814075231552124, "reward_std": 0.08035072684288025, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.814075231552124, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 290.34375, "epoch": 0.853515625, "grad_norm": 2.5544601480394498, "kl": 0.0596923828125, "learning_rate": 7.8662109375e-07, "loss": 0.0024, "reward": 1.67475825548172, "reward_std": 0.09878268092870712, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6747583150863647, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 273.3671875, "epoch": 0.85400390625, "grad_norm": 1.2963780193553962, "kl": 0.0528564453125, "learning_rate": 7.864990234375e-07, "loss": 0.0021, "reward": 1.7922492623329163, "reward_std": 0.04872659081593156, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7922492027282715, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 400.25, "epoch": 0.8544921875, "grad_norm": 2.338549025732979, "kl": 0.0543212890625, "learning_rate": 7.86376953125e-07, "loss": 0.0022, "reward": 1.772305965423584, "reward_std": 0.07593853399157524, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.780118465423584, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 365.546875, "epoch": 0.85498046875, "grad_norm": 1.4071606737568558, "kl": 0.04296875, "learning_rate": 7.862548828125e-07, "loss": 0.0017, "reward": 1.8112922310829163, "reward_std": 0.13108721747994423, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8425421714782715, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 324.6328125, "epoch": 0.85546875, "grad_norm": 3.826688951216383, "kl": 0.0850830078125, "learning_rate": 7.861328124999999e-07, "loss": 0.0034, "reward": 1.6129182577133179, "reward_std": 0.13975085318088531, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6285432279109955, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 312.6328125, "epoch": 0.85595703125, "grad_norm": 0.9707651176685911, "kl": 0.0546875, "learning_rate": 7.860107421874999e-07, "loss": 0.0022, "reward": 1.7708771228790283, "reward_std": 0.022707084193825722, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7708771228790283, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 362.515625, "epoch": 0.8564453125, "grad_norm": 1.4575860899322022, "kl": 0.0499267578125, "learning_rate": 7.858886718749999e-07, "loss": 0.002, "reward": 1.7548741698265076, "reward_std": 0.12130584567785263, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7704991102218628, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 387.8203125, "epoch": 0.85693359375, "grad_norm": 1.7544323097135777, "kl": 0.0548095703125, "learning_rate": 7.857666015625e-07, "loss": 0.0022, "reward": 1.6898673176765442, "reward_std": 0.1721840798854828, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7054923176765442, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 380.5234375, "epoch": 0.857421875, "grad_norm": 0.8604056131796117, "kl": 0.050537109375, "learning_rate": 7.8564453125e-07, "loss": 0.002, "reward": 1.7485257983207703, "reward_std": 0.15745490044355392, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7875882983207703, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 366.8359375, "epoch": 0.85791015625, "grad_norm": 1.4416503344043057, "kl": 0.057861328125, "learning_rate": 7.855224609375e-07, "loss": 0.0023, "reward": 1.7979487776756287, "reward_std": 0.09817294403910637, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7979487776756287, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 318.09375, "epoch": 0.8583984375, "grad_norm": 5.069045278301706, "kl": 0.0528564453125, "learning_rate": 7.85400390625e-07, "loss": 0.0021, "reward": 1.7911220788955688, "reward_std": 0.10761953145265579, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8067470788955688, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 331.5, "epoch": 0.85888671875, "grad_norm": 1.5478469273798983, "kl": 0.0439453125, "learning_rate": 7.852783203124999e-07, "loss": 0.0018, "reward": 1.7176623344421387, "reward_std": 0.11607952415943146, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7645373642444611, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 418.921875, "epoch": 0.859375, "grad_norm": 0.8812677172192789, "kl": 0.056640625, "learning_rate": 7.851562499999999e-07, "loss": 0.0023, "reward": 1.5803175568580627, "reward_std": 0.12807496264576912, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6115675568580627, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 278.8515625, "epoch": 0.85986328125, "grad_norm": 3.5600117966088054, "kl": 0.0543212890625, "learning_rate": 7.850341796875e-07, "loss": 0.0022, "reward": 1.7184030413627625, "reward_std": 0.07983948290348053, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7262155115604401, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 254.03125, "epoch": 0.8603515625, "grad_norm": 9.189097579338886, "kl": 0.055419921875, "learning_rate": 7.84912109375e-07, "loss": 0.0022, "reward": 1.691203534603119, "reward_std": 0.07379813119769096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6912035048007965, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 263.8828125, "epoch": 0.86083984375, "grad_norm": 1.6292809358924043, "kl": 0.062255859375, "learning_rate": 7.847900390625e-07, "loss": 0.0025, "reward": 1.616748571395874, "reward_std": 0.07066140696406364, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6245611011981964, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 239.6328125, "epoch": 0.861328125, "grad_norm": 8.127274020590354, "kl": 0.08251953125, "learning_rate": 7.8466796875e-07, "loss": 0.0033, "reward": 1.6578654646873474, "reward_std": 0.11017253622412682, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6578654944896698, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 331.4453125, "epoch": 0.86181640625, "grad_norm": 1.8627529528124558, "kl": 0.05224609375, "learning_rate": 7.845458984374999e-07, "loss": 0.0021, "reward": 1.793116271495819, "reward_std": 0.10687560588121414, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8009287714958191, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 350.15625, "epoch": 0.8623046875, "grad_norm": 5.016437160343661, "kl": 0.05029296875, "learning_rate": 7.844238281249999e-07, "loss": 0.002, "reward": 1.758280873298645, "reward_std": 0.04120416380465031, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.758280873298645, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 415.0859375, "epoch": 0.86279296875, "grad_norm": 2.458305708230358, "kl": 0.0460205078125, "learning_rate": 7.843017578124999e-07, "loss": 0.0018, "reward": 1.6915509700775146, "reward_std": 0.14069624990224838, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7071759104728699, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 294.53125, "epoch": 0.86328125, "grad_norm": 1.4127014544761989, "kl": 0.067138671875, "learning_rate": 7.841796875e-07, "loss": 0.0027, "reward": 1.727443516254425, "reward_std": 0.09978067316114902, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.735256016254425, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 305.046875, "epoch": 0.86376953125, "grad_norm": 1.3815442374612499, "kl": 0.0565185546875, "learning_rate": 7.840576171875e-07, "loss": 0.0023, "reward": 1.7928959131240845, "reward_std": 0.08073288947343826, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7928958535194397, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 288.1953125, "epoch": 0.8642578125, "grad_norm": 4.466680027866366, "kl": 0.054931640625, "learning_rate": 7.83935546875e-07, "loss": 0.0022, "reward": 1.7177514433860779, "reward_std": 0.08392149582505226, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7177514135837555, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 344.921875, "epoch": 0.86474609375, "grad_norm": 1.1403297589321426, "kl": 0.0496826171875, "learning_rate": 7.838134765625e-07, "loss": 0.002, "reward": 1.70395165681839, "reward_std": 0.04686661344021559, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7039515972137451, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 275.1953125, "epoch": 0.865234375, "grad_norm": 10.706867765852694, "kl": 0.05419921875, "learning_rate": 7.836914062499999e-07, "loss": 0.0022, "reward": 1.7995309829711914, "reward_std": 0.03605970740318298, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.799530953168869, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 391.9609375, "epoch": 0.86572265625, "grad_norm": 3.3271446204369073, "kl": 0.051025390625, "learning_rate": 7.835693359374999e-07, "loss": 0.002, "reward": 1.6900931596755981, "reward_std": 0.18629964627325535, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7369681894779205, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 383.296875, "epoch": 0.8662109375, "grad_norm": 1.834654441326854, "kl": 0.053466796875, "learning_rate": 7.83447265625e-07, "loss": 0.0021, "reward": 1.714508295059204, "reward_std": 0.11370455846190453, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7613833248615265, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 271.765625, "epoch": 0.86669921875, "grad_norm": 1.879783754964238, "kl": 0.05615234375, "learning_rate": 7.833251953125e-07, "loss": 0.0022, "reward": 1.630328118801117, "reward_std": 0.08714995346963406, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6381406188011169, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 277.265625, "epoch": 0.8671875, "grad_norm": 2.054382936637815, "kl": 0.0517578125, "learning_rate": 7.83203125e-07, "loss": 0.0021, "reward": 1.6842593550682068, "reward_std": 0.028140094596892595, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6842593252658844, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 249.4609375, "epoch": 0.86767578125, "grad_norm": 1.6929850993037547, "kl": 0.049560546875, "learning_rate": 7.830810546875e-07, "loss": 0.002, "reward": 1.776337742805481, "reward_std": 0.04738312214612961, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.776337742805481, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 292.15625, "epoch": 0.8681640625, "grad_norm": 1.944714939624994, "kl": 0.046630859375, "learning_rate": 7.829589843749999e-07, "loss": 0.0019, "reward": 1.8062950372695923, "reward_std": 0.0485474169254303, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8062950074672699, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 286.921875, "epoch": 0.86865234375, "grad_norm": 2.0500856307717155, "kl": 0.0758056640625, "learning_rate": 7.828369140624999e-07, "loss": 0.003, "reward": 1.7174754738807678, "reward_std": 0.05034205690026283, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7174754738807678, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 307.578125, "epoch": 0.869140625, "grad_norm": 1.8640037636723337, "kl": 0.049560546875, "learning_rate": 7.827148437499999e-07, "loss": 0.002, "reward": 1.6384202241897583, "reward_std": 0.10762511938810349, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6462327837944031, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 256.34375, "epoch": 0.86962890625, "grad_norm": 3.8282446965440684, "kl": 0.052978515625, "learning_rate": 7.825927734375e-07, "loss": 0.0021, "reward": 1.77534019947052, "reward_std": 0.057393044233322144, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7753402590751648, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 269.5390625, "epoch": 0.8701171875, "grad_norm": 1.8964741899587358, "kl": 0.0552978515625, "learning_rate": 7.82470703125e-07, "loss": 0.0022, "reward": 1.774406909942627, "reward_std": 0.1195422075688839, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7744069397449493, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 331.875, "epoch": 0.87060546875, "grad_norm": 1.4724984337179923, "kl": 0.06591796875, "learning_rate": 7.823486328125e-07, "loss": 0.0026, "reward": 1.695317268371582, "reward_std": 0.09004146233201027, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7031297087669373, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 326.9609375, "epoch": 0.87109375, "grad_norm": 0.6448629109262315, "kl": 0.04736328125, "learning_rate": 7.822265625e-07, "loss": 0.0019, "reward": 1.9076035022735596, "reward_std": 0.01943269930779934, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.907603532075882, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 257.0234375, "epoch": 0.87158203125, "grad_norm": 3.027195875222421, "kl": 0.0672607421875, "learning_rate": 7.821044921874999e-07, "loss": 0.0027, "reward": 1.689346194267273, "reward_std": 0.08278231136500835, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6893462538719177, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 308.359375, "epoch": 0.8720703125, "grad_norm": 1.1478726616336268, "kl": 0.0518798828125, "learning_rate": 7.819824218749999e-07, "loss": 0.0021, "reward": 1.7398544549942017, "reward_std": 0.08145036175847054, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7789169549942017, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 285.3671875, "epoch": 0.87255859375, "grad_norm": 11.542962874754854, "kl": 0.087890625, "learning_rate": 7.818603515625e-07, "loss": 0.0035, "reward": 1.6417620182037354, "reward_std": 0.12737858295440674, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6573870182037354, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 250.75, "epoch": 0.873046875, "grad_norm": 2.404778691827622, "kl": 0.05078125, "learning_rate": 7.8173828125e-07, "loss": 0.002, "reward": 1.8236736059188843, "reward_std": 0.03153271973133087, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8236735463142395, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 276.8984375, "epoch": 0.87353515625, "grad_norm": 2.4749194133053596, "kl": 0.0501708984375, "learning_rate": 7.816162109375e-07, "loss": 0.002, "reward": 1.7504101991653442, "reward_std": 0.08443843200802803, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7582226991653442, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 335.7421875, "epoch": 0.8740234375, "grad_norm": 2.6927007447476257, "kl": 0.0517578125, "learning_rate": 7.81494140625e-07, "loss": 0.0021, "reward": 1.6656638979911804, "reward_std": 0.12951365113258362, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6734763383865356, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 266.71875, "epoch": 0.87451171875, "grad_norm": 3.271122934848648, "kl": 0.047607421875, "learning_rate": 7.813720703125e-07, "loss": 0.0019, "reward": 1.746010661125183, "reward_std": 0.09499474987387657, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7460106015205383, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 220.2421875, "epoch": 0.875, "grad_norm": 1.9497103555365891, "kl": 0.076171875, "learning_rate": 7.812499999999999e-07, "loss": 0.003, "reward": 1.683960497379303, "reward_std": 0.07071896642446518, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6839604675769806, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 307.1484375, "epoch": 0.87548828125, "grad_norm": 4.006011455889697, "kl": 0.06005859375, "learning_rate": 7.811279296874999e-07, "loss": 0.0024, "reward": 1.8330675959587097, "reward_std": 0.023156346287578344, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8330676555633545, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 242.7265625, "epoch": 0.8759765625, "grad_norm": 25.852237909125545, "kl": 0.062744140625, "learning_rate": 7.81005859375e-07, "loss": 0.0025, "reward": 1.7323620319366455, "reward_std": 0.049556052312254906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7323620617389679, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 245.765625, "epoch": 0.87646484375, "grad_norm": 1.5076425145126342, "kl": 0.0516357421875, "learning_rate": 7.808837890625e-07, "loss": 0.0021, "reward": 1.819112241268158, "reward_std": 0.0596193540841341, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8191123008728027, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 298.5703125, "epoch": 0.876953125, "grad_norm": 3.047457685966905, "kl": 0.054443359375, "learning_rate": 7.8076171875e-07, "loss": 0.0022, "reward": 1.673986792564392, "reward_std": 0.09043450467288494, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6739867627620697, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 228.609375, "epoch": 0.87744140625, "grad_norm": 1.1587782691548503, "kl": 0.0494384765625, "learning_rate": 7.806396484375e-07, "loss": 0.002, "reward": 1.7295081615447998, "reward_std": 0.06940071284770966, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7295081615447998, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 272.0703125, "epoch": 0.8779296875, "grad_norm": 4.452290043361228, "kl": 0.0526123046875, "learning_rate": 7.805175781249999e-07, "loss": 0.0021, "reward": 1.8323208689689636, "reward_std": 0.05967606604099274, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8323208391666412, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 315.703125, "epoch": 0.87841796875, "grad_norm": 25.202792928468167, "kl": 0.05419921875, "learning_rate": 7.803955078124999e-07, "loss": 0.0022, "reward": 1.7258835434913635, "reward_std": 0.09938307851552963, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7258834838867188, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 319.296875, "epoch": 0.87890625, "grad_norm": 0.6818241842690064, "kl": 0.043701171875, "learning_rate": 7.802734375e-07, "loss": 0.0017, "reward": 1.7057358026504517, "reward_std": 0.05737200379371643, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7057357132434845, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 278.0, "epoch": 0.87939453125, "grad_norm": 3.693729462628242, "kl": 0.062744140625, "learning_rate": 7.801513671875e-07, "loss": 0.0025, "reward": 1.7509536743164062, "reward_std": 0.04812243953347206, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7509536445140839, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 241.8984375, "epoch": 0.8798828125, "grad_norm": 1.9779481675448194, "kl": 0.042724609375, "learning_rate": 7.80029296875e-07, "loss": 0.0017, "reward": 1.8703011870384216, "reward_std": 0.03746516443789005, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8703011870384216, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 268.8984375, "epoch": 0.88037109375, "grad_norm": 2.2905824394928884, "kl": 0.059326171875, "learning_rate": 7.799072265625e-07, "loss": 0.0024, "reward": 1.8006829619407654, "reward_std": 0.0814764704555273, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8006830215454102, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 242.2265625, "epoch": 0.880859375, "grad_norm": 1.6573366743968407, "kl": 0.0570068359375, "learning_rate": 7.7978515625e-07, "loss": 0.0023, "reward": 1.6909406185150146, "reward_std": 0.0707071777433157, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.690940648317337, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 268.0625, "epoch": 0.88134765625, "grad_norm": 0.7192785248577067, "kl": 0.051513671875, "learning_rate": 7.796630859374999e-07, "loss": 0.0021, "reward": 1.6431750655174255, "reward_std": 0.06791674718260765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6431750655174255, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 279.5703125, "epoch": 0.8818359375, "grad_norm": 0.9208200253080467, "kl": 0.058349609375, "learning_rate": 7.795410156249999e-07, "loss": 0.0023, "reward": 1.709853172302246, "reward_std": 0.11433164775371552, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7098531723022461, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 223.265625, "epoch": 0.88232421875, "grad_norm": 2.164142841901239, "kl": 0.0660400390625, "learning_rate": 7.794189453125e-07, "loss": 0.0026, "reward": 1.6406881213188171, "reward_std": 0.11811601743102074, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6406880915164948, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 244.09375, "epoch": 0.8828125, "grad_norm": 15.416769114837617, "kl": 0.072509765625, "learning_rate": 7.79296875e-07, "loss": 0.0029, "reward": 1.8091920614242554, "reward_std": 0.032884467393159866, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.809192031621933, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 283.3046875, "epoch": 0.88330078125, "grad_norm": 2.391571899624504, "kl": 0.0584716796875, "learning_rate": 7.791748046875e-07, "loss": 0.0023, "reward": 1.8211405277252197, "reward_std": 0.05889258533716202, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.821140468120575, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 323.2421875, "epoch": 0.8837890625, "grad_norm": 1.7474116859623878, "kl": 0.0506591796875, "learning_rate": 7.79052734375e-07, "loss": 0.002, "reward": 1.7022829055786133, "reward_std": 0.04944469407200813, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7022829353809357, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 281.1875, "epoch": 0.88427734375, "grad_norm": 5.108476758820766, "kl": 0.0616455078125, "learning_rate": 7.789306640624999e-07, "loss": 0.0025, "reward": 1.65779048204422, "reward_std": 0.11469753831624985, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.65779048204422, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 316.09375, "epoch": 0.884765625, "grad_norm": 1.4333680344113544, "kl": 0.0460205078125, "learning_rate": 7.788085937499999e-07, "loss": 0.0018, "reward": 1.8354427814483643, "reward_std": 0.10553473606705666, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8354427516460419, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 201.796875, "epoch": 0.88525390625, "grad_norm": 1.5118729896701895, "kl": 0.0584716796875, "learning_rate": 7.786865234375e-07, "loss": 0.0023, "reward": 1.817187786102295, "reward_std": 0.08914723992347717, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8171877861022949, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 247.328125, "epoch": 0.8857421875, "grad_norm": 6.425239274241706, "kl": 0.060791015625, "learning_rate": 7.78564453125e-07, "loss": 0.0024, "reward": 1.7643995881080627, "reward_std": 0.0862666517496109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7643995881080627, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 314.5390625, "epoch": 0.88623046875, "grad_norm": 6.990205543539001, "kl": 0.07421875, "learning_rate": 7.784423828125e-07, "loss": 0.003, "reward": 1.6783007383346558, "reward_std": 0.08350778743624687, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6783007085323334, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 246.078125, "epoch": 0.88671875, "grad_norm": 3.51961258176851, "kl": 0.0556640625, "learning_rate": 7.783203125e-07, "loss": 0.0022, "reward": 1.7718433737754822, "reward_std": 0.054395925253629684, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7718433439731598, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 203.296875, "epoch": 0.88720703125, "grad_norm": 2.1702787043708143, "kl": 0.0628662109375, "learning_rate": 7.781982421875e-07, "loss": 0.0025, "reward": 1.8164880275726318, "reward_std": 0.0386070990934968, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.816488116979599, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 276.2734375, "epoch": 0.8876953125, "grad_norm": 1.594469695809148, "kl": 0.056640625, "learning_rate": 7.780761718749999e-07, "loss": 0.0023, "reward": 1.6637941598892212, "reward_std": 0.04839322529733181, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6637941598892212, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 317.59375, "epoch": 0.88818359375, "grad_norm": 1.0269338480997001, "kl": 0.060302734375, "learning_rate": 7.779541015624999e-07, "loss": 0.0024, "reward": 1.8385123014450073, "reward_std": 0.044711560010910034, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8385123014450073, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 341.6875, "epoch": 0.888671875, "grad_norm": 4.6582457557107615, "kl": 0.0445556640625, "learning_rate": 7.7783203125e-07, "loss": 0.0018, "reward": 1.7252464294433594, "reward_std": 0.08425504341721535, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.733058899641037, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 278.6640625, "epoch": 0.88916015625, "grad_norm": 4.495257505799833, "kl": 0.0631103515625, "learning_rate": 7.777099609375e-07, "loss": 0.0025, "reward": 1.815511703491211, "reward_std": 0.05697597935795784, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8155117332935333, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 224.21875, "epoch": 0.8896484375, "grad_norm": 1.5187517576726908, "kl": 0.07177734375, "learning_rate": 7.77587890625e-07, "loss": 0.0029, "reward": 1.7230549454689026, "reward_std": 0.03447245853021741, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7230549454689026, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 369.4140625, "epoch": 0.89013671875, "grad_norm": 1.5792144097013674, "kl": 0.0489501953125, "learning_rate": 7.774658203125e-07, "loss": 0.002, "reward": 1.6508527398109436, "reward_std": 0.12545301765203476, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6586652100086212, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 277.2265625, "epoch": 0.890625, "grad_norm": 1.7338004675325442, "kl": 0.0616455078125, "learning_rate": 7.773437499999999e-07, "loss": 0.0025, "reward": 1.6528041362762451, "reward_std": 0.03595791570842266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6528041362762451, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 265.875, "epoch": 0.89111328125, "grad_norm": 1.3828966259193087, "kl": 0.0540771484375, "learning_rate": 7.772216796874999e-07, "loss": 0.0022, "reward": 1.6901865601539612, "reward_std": 0.06458355858922005, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6979990303516388, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 307.0390625, "epoch": 0.8916015625, "grad_norm": 4.53384888949427, "kl": 0.0604248046875, "learning_rate": 7.77099609375e-07, "loss": 0.0024, "reward": 1.7263333797454834, "reward_std": 0.06643800996243954, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7341458201408386, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 266.4140625, "epoch": 0.89208984375, "grad_norm": 1.7484243028024995, "kl": 0.0562744140625, "learning_rate": 7.769775390625e-07, "loss": 0.0022, "reward": 1.7096668481826782, "reward_std": 0.1043664738535881, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7096668183803558, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 297.5859375, "epoch": 0.892578125, "grad_norm": 4.9974540266792, "kl": 0.0611572265625, "learning_rate": 7.7685546875e-07, "loss": 0.0025, "reward": 1.6894102096557617, "reward_std": 0.0981958694756031, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6894101500511169, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 287.0703125, "epoch": 0.89306640625, "grad_norm": 3.5144777679347463, "kl": 0.060546875, "learning_rate": 7.767333984375e-07, "loss": 0.0024, "reward": 1.7617112398147583, "reward_std": 0.09093910502269864, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7695237696170807, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 416.875, "epoch": 0.8935546875, "grad_norm": 1.5653767539180587, "kl": 0.040771484375, "learning_rate": 7.76611328125e-07, "loss": 0.0016, "reward": 1.8352625370025635, "reward_std": 0.09809044748544693, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8430750966072083, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 204.125, "epoch": 0.89404296875, "grad_norm": 1.9658571112390546, "kl": 0.0611572265625, "learning_rate": 7.764892578124999e-07, "loss": 0.0024, "reward": 1.862768530845642, "reward_std": 0.025783130899071693, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8627684712409973, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 383.453125, "epoch": 0.89453125, "grad_norm": 3.184036254441203, "kl": 0.0498046875, "learning_rate": 7.763671874999999e-07, "loss": 0.002, "reward": 1.756974220275879, "reward_std": 0.04832346737384796, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7569742202758789, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 301.0390625, "epoch": 0.89501953125, "grad_norm": 1.7521194443833326, "kl": 0.0618896484375, "learning_rate": 7.762451171875e-07, "loss": 0.0025, "reward": 1.7110464572906494, "reward_std": 0.07836447097361088, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7110464870929718, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 307.0859375, "epoch": 0.8955078125, "grad_norm": 11.051729675768927, "kl": 0.0628662109375, "learning_rate": 7.76123046875e-07, "loss": 0.0025, "reward": 1.6928837299346924, "reward_std": 0.09242498874664307, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6928837299346924, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 293.4765625, "epoch": 0.89599609375, "grad_norm": 1.9611838635137748, "kl": 0.0665283203125, "learning_rate": 7.760009765625e-07, "loss": 0.0027, "reward": 1.7181519269943237, "reward_std": 0.08656962960958481, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7181519567966461, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 345.765625, "epoch": 0.896484375, "grad_norm": 1.9097853521680372, "kl": 0.063232421875, "learning_rate": 7.7587890625e-07, "loss": 0.0025, "reward": 1.6719039678573608, "reward_std": 0.0817815288901329, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6719039082527161, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 330.1484375, "epoch": 0.89697265625, "grad_norm": 2.040065547317354, "kl": 0.054443359375, "learning_rate": 7.757568359374999e-07, "loss": 0.0022, "reward": 1.8175336122512817, "reward_std": 0.092707434669137, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.825346052646637, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 246.5703125, "epoch": 0.8974609375, "grad_norm": 2.114483844672761, "kl": 0.06591796875, "learning_rate": 7.756347656249999e-07, "loss": 0.0026, "reward": 1.7319183945655823, "reward_std": 0.047073543071746826, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7319183647632599, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 321.71875, "epoch": 0.89794921875, "grad_norm": 0.7428215101894872, "kl": 0.0401611328125, "learning_rate": 7.755126953125e-07, "loss": 0.0016, "reward": 1.7587011456489563, "reward_std": 0.026800723746418953, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7587011754512787, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 339.921875, "epoch": 0.8984375, "grad_norm": 0.7629811197679781, "kl": 0.0484619140625, "learning_rate": 7.75390625e-07, "loss": 0.0019, "reward": 1.6852461099624634, "reward_std": 0.08999980986118317, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6930586099624634, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 361.3984375, "epoch": 0.89892578125, "grad_norm": 1.5555072595949755, "kl": 0.0543212890625, "learning_rate": 7.752685546875e-07, "loss": 0.0022, "reward": 1.7511460781097412, "reward_std": 0.06476838141679764, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7511460781097412, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 282.75, "epoch": 0.8994140625, "grad_norm": 2.754364916244924, "kl": 0.0587158203125, "learning_rate": 7.75146484375e-07, "loss": 0.0024, "reward": 1.8262133598327637, "reward_std": 0.02120867930352688, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8262133896350861, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 319.7421875, "epoch": 0.89990234375, "grad_norm": 24.641127364889815, "kl": 0.0570068359375, "learning_rate": 7.750244140625e-07, "loss": 0.0023, "reward": 1.6971803903579712, "reward_std": 0.05197112262248993, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6971803903579712, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 247.484375, "epoch": 0.900390625, "grad_norm": 2.1238794464311006, "kl": 0.06982421875, "learning_rate": 7.749023437499999e-07, "loss": 0.0028, "reward": 1.6669594049453735, "reward_std": 0.04614550992846489, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6669594645500183, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 341.78125, "epoch": 0.90087890625, "grad_norm": 2.071912757412851, "kl": 0.0523681640625, "learning_rate": 7.747802734374999e-07, "loss": 0.0021, "reward": 1.6606204509735107, "reward_std": 0.08798486739397049, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6684330701828003, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 334.546875, "epoch": 0.9013671875, "grad_norm": 3.3768722950453633, "kl": 0.050048828125, "learning_rate": 7.74658203125e-07, "loss": 0.002, "reward": 1.7388845682144165, "reward_std": 0.05811982438899577, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7545095980167389, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 301.4609375, "epoch": 0.90185546875, "grad_norm": 3.1836866801117893, "kl": 0.0430908203125, "learning_rate": 7.745361328125e-07, "loss": 0.0017, "reward": 1.8226521015167236, "reward_std": 0.04751377273350954, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8226520419120789, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 367.1484375, "epoch": 0.90234375, "grad_norm": 1.576033496726682, "kl": 0.0771484375, "learning_rate": 7.744140625e-07, "loss": 0.0031, "reward": 1.7408050298690796, "reward_std": 0.17640165239572525, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7720550298690796, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 217.8671875, "epoch": 0.90283203125, "grad_norm": 1.7373002567871683, "kl": 0.0501708984375, "learning_rate": 7.742919921875e-07, "loss": 0.002, "reward": 1.8163398504257202, "reward_std": 0.04968139063566923, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8163398206233978, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 267.84375, "epoch": 0.9033203125, "grad_norm": 3.105106822155688, "kl": 0.0472412109375, "learning_rate": 7.741699218749999e-07, "loss": 0.0019, "reward": 1.8374771475791931, "reward_std": 0.09784207679331303, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8452896475791931, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 373.21875, "epoch": 0.90380859375, "grad_norm": 1.089428365307123, "kl": 0.0489501953125, "learning_rate": 7.740478515624999e-07, "loss": 0.002, "reward": 1.68122398853302, "reward_std": 0.09672827832400799, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6968489587306976, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 261.0546875, "epoch": 0.904296875, "grad_norm": 0.9032820450625452, "kl": 0.0506591796875, "learning_rate": 7.739257812499999e-07, "loss": 0.002, "reward": 1.7232590913772583, "reward_std": 0.0855883564800024, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7310715913772583, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 263.8203125, "epoch": 0.90478515625, "grad_norm": 2.910697140965429, "kl": 0.0596923828125, "learning_rate": 7.738037109375e-07, "loss": 0.0024, "reward": 1.718904733657837, "reward_std": 0.05999594181776047, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7189047038555145, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 289.71875, "epoch": 0.9052734375, "grad_norm": 2.3512710493981044, "kl": 0.062744140625, "learning_rate": 7.73681640625e-07, "loss": 0.0025, "reward": 1.8199704885482788, "reward_std": 0.17502456158399582, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8199705183506012, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 333.9453125, "epoch": 0.90576171875, "grad_norm": 1.7378278510340661, "kl": 0.055419921875, "learning_rate": 7.735595703125e-07, "loss": 0.0022, "reward": 1.7747780680656433, "reward_std": 0.08231132477521896, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7747780382633209, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 309.96875, "epoch": 0.90625, "grad_norm": 1.3484996843881978, "kl": 0.06640625, "learning_rate": 7.734375e-07, "loss": 0.0027, "reward": 1.7257348895072937, "reward_std": 0.08233419992029667, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7335473895072937, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 353.859375, "epoch": 0.90673828125, "grad_norm": 1.3532753106816202, "kl": 0.0523681640625, "learning_rate": 7.733154296874999e-07, "loss": 0.0021, "reward": 1.6416913270950317, "reward_std": 0.11033252347260714, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6495038270950317, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 341.0625, "epoch": 0.9072265625, "grad_norm": 2.4747324412246208, "kl": 0.06201171875, "learning_rate": 7.731933593749999e-07, "loss": 0.0025, "reward": 1.686651587486267, "reward_std": 0.11174037307500839, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6944640278816223, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 367.75, "epoch": 0.90771484375, "grad_norm": 2.5596722124199562, "kl": 0.0435791015625, "learning_rate": 7.730712890625e-07, "loss": 0.0017, "reward": 1.7805684804916382, "reward_std": 0.0784122459590435, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7883809506893158, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 266.2109375, "epoch": 0.908203125, "grad_norm": 1.9096008823123074, "kl": 0.0513916015625, "learning_rate": 7.7294921875e-07, "loss": 0.0021, "reward": 1.645488977432251, "reward_std": 0.07388130389153957, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6533015072345734, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 335.8046875, "epoch": 0.90869140625, "grad_norm": 1.9563618836244545, "kl": 0.057861328125, "learning_rate": 7.728271484375e-07, "loss": 0.0023, "reward": 1.600885808467865, "reward_std": 0.12279289960861206, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6086983382701874, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 308.578125, "epoch": 0.9091796875, "grad_norm": 2.251229228911326, "kl": 0.06201171875, "learning_rate": 7.72705078125e-07, "loss": 0.0025, "reward": 1.6833316087722778, "reward_std": 0.09088350087404251, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6833316385746002, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 237.953125, "epoch": 0.90966796875, "grad_norm": 4.494937362372943, "kl": 0.0693359375, "learning_rate": 7.725830078124999e-07, "loss": 0.0028, "reward": 1.7052226066589355, "reward_std": 0.0832928977906704, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7052225768566132, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 216.203125, "epoch": 0.91015625, "grad_norm": 0.8112151237513328, "kl": 0.061279296875, "learning_rate": 7.724609374999999e-07, "loss": 0.0025, "reward": 1.7318594455718994, "reward_std": 0.03134281374514103, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.731859415769577, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 290.4453125, "epoch": 0.91064453125, "grad_norm": 2.793216129592739, "kl": 0.068603515625, "learning_rate": 7.723388671874999e-07, "loss": 0.0027, "reward": 1.750407099723816, "reward_std": 0.12610271200537682, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7582195699214935, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 341.59375, "epoch": 0.9111328125, "grad_norm": 1.4003090946656476, "kl": 0.0469970703125, "learning_rate": 7.72216796875e-07, "loss": 0.0019, "reward": 1.7669113874435425, "reward_std": 0.05195538140833378, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7669114768505096, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 237.6953125, "epoch": 0.91162109375, "grad_norm": 12.65215383630023, "kl": 0.055908203125, "learning_rate": 7.720947265625e-07, "loss": 0.0022, "reward": 1.841326653957367, "reward_std": 0.046704126521945, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8413266539573669, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 294.6484375, "epoch": 0.912109375, "grad_norm": 1.6673497617014856, "kl": 0.0653076171875, "learning_rate": 7.7197265625e-07, "loss": 0.0026, "reward": 1.7550670504570007, "reward_std": 0.08225375413894653, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7628795802593231, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 284.6328125, "epoch": 0.91259765625, "grad_norm": 1.5372288043835853, "kl": 0.0531005859375, "learning_rate": 7.718505859375e-07, "loss": 0.0021, "reward": 1.7609045505523682, "reward_std": 0.03866549767553806, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.760904461145401, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 285.96875, "epoch": 0.9130859375, "grad_norm": 3.1166984800175563, "kl": 0.06201171875, "learning_rate": 7.717285156249999e-07, "loss": 0.0025, "reward": 1.7748718857765198, "reward_std": 0.04408053681254387, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7748719453811646, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 252.65625, "epoch": 0.91357421875, "grad_norm": 1.146220052210111, "kl": 0.059814453125, "learning_rate": 7.716064453124999e-07, "loss": 0.0024, "reward": 1.8123140931129456, "reward_std": 0.06602787971496582, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8123140633106232, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 291.8359375, "epoch": 0.9140625, "grad_norm": 2.7210699250281505, "kl": 0.0550537109375, "learning_rate": 7.71484375e-07, "loss": 0.0022, "reward": 1.7665232419967651, "reward_std": 0.012389869894832373, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7665232121944427, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 326.6953125, "epoch": 0.91455078125, "grad_norm": 1.827791651361238, "kl": 0.05517578125, "learning_rate": 7.713623046875e-07, "loss": 0.0022, "reward": 1.8029692769050598, "reward_std": 0.07247792184352875, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8029692471027374, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 191.203125, "epoch": 0.9150390625, "grad_norm": 1.5789037304261104, "kl": 0.0679931640625, "learning_rate": 7.71240234375e-07, "loss": 0.0027, "reward": 1.7826859951019287, "reward_std": 0.07684960961341858, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7826859951019287, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 298.1875, "epoch": 0.91552734375, "grad_norm": 3.343706764989121, "kl": 0.066650390625, "learning_rate": 7.711181640625e-07, "loss": 0.0027, "reward": 1.7790513634681702, "reward_std": 0.032321374863386154, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7790513634681702, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 405.9609375, "epoch": 0.916015625, "grad_norm": 2.533953944850406, "kl": 0.048095703125, "learning_rate": 7.709960937499999e-07, "loss": 0.0019, "reward": 1.7507587671279907, "reward_std": 0.06509637832641602, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7507588565349579, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 424.703125, "epoch": 0.91650390625, "grad_norm": 0.7586952119724258, "kl": 0.046875, "learning_rate": 7.708740234374999e-07, "loss": 0.0019, "reward": 1.7596052885055542, "reward_std": 0.12552650086581707, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7830427885055542, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 349.7890625, "epoch": 0.9169921875, "grad_norm": 1.6249630598487124, "kl": 0.0576171875, "learning_rate": 7.707519531249999e-07, "loss": 0.0023, "reward": 1.6994884610176086, "reward_std": 0.03150587156414986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6994884312152863, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 232.3671875, "epoch": 0.91748046875, "grad_norm": 2.026286135731339, "kl": 0.0614013671875, "learning_rate": 7.706298828125e-07, "loss": 0.0025, "reward": 1.8732419610023499, "reward_std": 0.06732478551566601, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8732418417930603, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 446.859375, "epoch": 0.91796875, "grad_norm": 2.345806175165156, "kl": 0.040771484375, "learning_rate": 7.705078125e-07, "loss": 0.0016, "reward": 1.6373432874679565, "reward_std": 0.19714245945215225, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6764057576656342, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 266.4765625, "epoch": 0.91845703125, "grad_norm": 2.349540924433874, "kl": 0.0516357421875, "learning_rate": 7.703857421875e-07, "loss": 0.0021, "reward": 1.746773898601532, "reward_std": 0.10035060532391071, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.762398898601532, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 432.234375, "epoch": 0.9189453125, "grad_norm": 2.33896483912966, "kl": 0.060546875, "learning_rate": 7.70263671875e-07, "loss": 0.0024, "reward": 1.564791977405548, "reward_std": 0.12818468734622002, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5882294774055481, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 274.984375, "epoch": 0.91943359375, "grad_norm": 2.2240687275141218, "kl": 0.05029296875, "learning_rate": 7.701416015624999e-07, "loss": 0.002, "reward": 1.7980252504348755, "reward_std": 0.08232726529240608, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8058376908302307, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 330.09375, "epoch": 0.919921875, "grad_norm": 2.8745265642260365, "kl": 0.0523681640625, "learning_rate": 7.700195312499999e-07, "loss": 0.0021, "reward": 1.7382362484931946, "reward_std": 0.12476624548435211, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7538612484931946, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 294.75, "epoch": 0.92041015625, "grad_norm": 3.686867510401221, "kl": 0.056640625, "learning_rate": 7.698974609375e-07, "loss": 0.0023, "reward": 1.7156809568405151, "reward_std": 0.08216442540287971, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7156809270381927, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 295.0, "epoch": 0.9208984375, "grad_norm": 6.288769712744168, "kl": 0.0609130859375, "learning_rate": 7.69775390625e-07, "loss": 0.0024, "reward": 1.7297690510749817, "reward_std": 0.04128149338066578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7297690212726593, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 237.8125, "epoch": 0.92138671875, "grad_norm": 2.0369024494011256, "kl": 0.0589599609375, "learning_rate": 7.696533203125e-07, "loss": 0.0024, "reward": 1.752552568912506, "reward_std": 0.02822397742420435, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7525525689125061, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 328.0, "epoch": 0.921875, "grad_norm": 1.2842086090273468, "kl": 0.0498046875, "learning_rate": 7.6953125e-07, "loss": 0.002, "reward": 1.8643844723701477, "reward_std": 0.03368113562464714, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8643843829631805, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 350.71875, "epoch": 0.92236328125, "grad_norm": 1.318950870858453, "kl": 0.0462646484375, "learning_rate": 7.694091796875e-07, "loss": 0.0019, "reward": 1.648529589176178, "reward_std": 0.057421027682721615, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6563420593738556, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 282.390625, "epoch": 0.9228515625, "grad_norm": 1.577865853545429, "kl": 0.082275390625, "learning_rate": 7.692871093749999e-07, "loss": 0.0033, "reward": 1.63528710603714, "reward_std": 0.06157683953642845, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6352871656417847, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 326.3359375, "epoch": 0.92333984375, "grad_norm": 7.518556987855353, "kl": 0.06005859375, "learning_rate": 7.691650390624999e-07, "loss": 0.0024, "reward": 1.722363293170929, "reward_std": 0.1048150509595871, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.730175793170929, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 324.1640625, "epoch": 0.923828125, "grad_norm": 1.507211128713716, "kl": 0.073486328125, "learning_rate": 7.6904296875e-07, "loss": 0.0029, "reward": 1.7711586356163025, "reward_std": 0.08003518357872963, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7711586952209473, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 309.4765625, "epoch": 0.92431640625, "grad_norm": 2.085935424953024, "kl": 0.0565185546875, "learning_rate": 7.689208984375e-07, "loss": 0.0023, "reward": 1.7579456567764282, "reward_std": 0.06935618259012699, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7579456567764282, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 464.4140625, "epoch": 0.9248046875, "grad_norm": 1.3818555531186942, "kl": 0.0494384765625, "learning_rate": 7.68798828125e-07, "loss": 0.002, "reward": 1.8424060940742493, "reward_std": 0.09461657330393791, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8502185940742493, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 318.734375, "epoch": 0.92529296875, "grad_norm": 1.1368563853147728, "kl": 0.0433349609375, "learning_rate": 7.686767578125e-07, "loss": 0.0017, "reward": 1.7353255152702332, "reward_std": 0.056853363290429115, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7587630748748779, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 362.1171875, "epoch": 0.92578125, "grad_norm": 2.2064650956492744, "kl": 0.06591796875, "learning_rate": 7.685546874999999e-07, "loss": 0.0026, "reward": 1.7270656824111938, "reward_std": 0.1103198304772377, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7348781824111938, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 314.03125, "epoch": 0.92626953125, "grad_norm": 1.4559467756111681, "kl": 0.06201171875, "learning_rate": 7.684326171874999e-07, "loss": 0.0025, "reward": 1.750020146369934, "reward_std": 0.06828867271542549, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7500201165676117, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 384.8046875, "epoch": 0.9267578125, "grad_norm": 1.792759355086428, "kl": 0.0550537109375, "learning_rate": 7.68310546875e-07, "loss": 0.0022, "reward": 1.638475477695465, "reward_std": 0.1530410349369049, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6541005373001099, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 300.265625, "epoch": 0.92724609375, "grad_norm": 4.437485735045951, "kl": 0.0570068359375, "learning_rate": 7.681884765625e-07, "loss": 0.0023, "reward": 1.79349684715271, "reward_std": 0.03604122344404459, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7934968173503876, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 314.796875, "epoch": 0.927734375, "grad_norm": 2.779442073043061, "kl": 0.0621337890625, "learning_rate": 7.6806640625e-07, "loss": 0.0025, "reward": 1.6727771162986755, "reward_std": 0.0650419145822525, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6727770864963531, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 313.4609375, "epoch": 0.92822265625, "grad_norm": 1.2941439692735104, "kl": 0.0511474609375, "learning_rate": 7.679443359375e-07, "loss": 0.002, "reward": 1.7836529612541199, "reward_std": 0.10273768194019794, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7992779314517975, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 298.625, "epoch": 0.9287109375, "grad_norm": 1.451926128352837, "kl": 0.058349609375, "learning_rate": 7.67822265625e-07, "loss": 0.0023, "reward": 1.775130271911621, "reward_std": 0.09728646278381348, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7829427421092987, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 264.890625, "epoch": 0.92919921875, "grad_norm": 2.2595384689614835, "kl": 0.0582275390625, "learning_rate": 7.677001953124999e-07, "loss": 0.0023, "reward": 1.7409818768501282, "reward_std": 0.0586724728345871, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7409819066524506, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 303.546875, "epoch": 0.9296875, "grad_norm": 0.6888205497832584, "kl": 0.060791015625, "learning_rate": 7.675781249999999e-07, "loss": 0.0024, "reward": 1.8382083773612976, "reward_std": 0.033903589239344, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.84602090716362, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 240.578125, "epoch": 0.93017578125, "grad_norm": 3.8284256312930305, "kl": 0.0533447265625, "learning_rate": 7.674560546875e-07, "loss": 0.0021, "reward": 1.8051932454109192, "reward_std": 0.038204182870686054, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8051932752132416, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 365.984375, "epoch": 0.9306640625, "grad_norm": 1.1662088703849192, "kl": 0.0560302734375, "learning_rate": 7.67333984375e-07, "loss": 0.0022, "reward": 1.6394376754760742, "reward_std": 0.12621871381998062, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6706876754760742, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 389.7109375, "epoch": 0.93115234375, "grad_norm": 1.0735713355110765, "kl": 0.0506591796875, "learning_rate": 7.672119140625e-07, "loss": 0.002, "reward": 1.775869071483612, "reward_std": 0.038261422887444496, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7758690416812897, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 350.578125, "epoch": 0.931640625, "grad_norm": 1.1925985760085656, "kl": 0.039306640625, "learning_rate": 7.6708984375e-07, "loss": 0.0016, "reward": 1.8779195547103882, "reward_std": 0.10916906967759132, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.893544614315033, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 238.703125, "epoch": 0.93212890625, "grad_norm": 0.8754908923158865, "kl": 0.0565185546875, "learning_rate": 7.669677734374999e-07, "loss": 0.0023, "reward": 1.790212869644165, "reward_std": 0.04031490348279476, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7902128398418427, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 275.78125, "epoch": 0.9326171875, "grad_norm": 1.130446425832009, "kl": 0.0740966796875, "learning_rate": 7.668457031249999e-07, "loss": 0.003, "reward": 1.737060308456421, "reward_std": 0.09539984166622162, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7526853680610657, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 246.2109375, "epoch": 0.93310546875, "grad_norm": 1.6278036854891171, "kl": 0.0521240234375, "learning_rate": 7.667236328125e-07, "loss": 0.0021, "reward": 1.7540555000305176, "reward_std": 0.04432438686490059, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7540555000305176, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 377.65625, "epoch": 0.93359375, "grad_norm": 2.1925742495284313, "kl": 0.0595703125, "learning_rate": 7.666015625e-07, "loss": 0.0024, "reward": 1.7391607761383057, "reward_std": 0.06034187972545624, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7391607463359833, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 259.921875, "epoch": 0.93408203125, "grad_norm": 7.166107534027712, "kl": 0.066650390625, "learning_rate": 7.664794921875e-07, "loss": 0.0027, "reward": 1.7412755489349365, "reward_std": 0.06487971171736717, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7490880191326141, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 279.40625, "epoch": 0.9345703125, "grad_norm": 1.2690228720660945, "kl": 0.0467529296875, "learning_rate": 7.66357421875e-07, "loss": 0.0019, "reward": 1.739248275756836, "reward_std": 0.03249887889251113, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7392483055591583, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 377.0078125, "epoch": 0.93505859375, "grad_norm": 1.7901065626462564, "kl": 0.0469970703125, "learning_rate": 7.662353515625e-07, "loss": 0.0019, "reward": 1.7768760919570923, "reward_std": 0.0804726853966713, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7846885025501251, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 315.75, "epoch": 0.935546875, "grad_norm": 1.5712939062202214, "kl": 0.0623779296875, "learning_rate": 7.661132812499999e-07, "loss": 0.0025, "reward": 1.786317765712738, "reward_std": 0.08603505790233612, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7863178253173828, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 302.0546875, "epoch": 0.93603515625, "grad_norm": 1.2722576840995556, "kl": 0.055908203125, "learning_rate": 7.659912109374999e-07, "loss": 0.0022, "reward": 1.8206439018249512, "reward_std": 0.05164727196097374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8206439316272736, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 271.4140625, "epoch": 0.9365234375, "grad_norm": 0.9577017742295563, "kl": 0.070556640625, "learning_rate": 7.65869140625e-07, "loss": 0.0028, "reward": 1.764600396156311, "reward_std": 0.06939095444977283, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7724128663539886, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 333.96875, "epoch": 0.93701171875, "grad_norm": 1.1704098534116705, "kl": 0.0428466796875, "learning_rate": 7.657470703125e-07, "loss": 0.0017, "reward": 1.7799164652824402, "reward_std": 0.06408461276441813, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7877289652824402, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 337.3203125, "epoch": 0.9375, "grad_norm": 1.3081495549462145, "kl": 0.0550537109375, "learning_rate": 7.65625e-07, "loss": 0.0022, "reward": 1.8275092840194702, "reward_std": 0.12271393835544586, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8431342542171478, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 399.5703125, "epoch": 0.93798828125, "grad_norm": 2.0893342759380125, "kl": 0.0570068359375, "learning_rate": 7.655029296875e-07, "loss": 0.0023, "reward": 1.62141752243042, "reward_std": 0.07201961986720562, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6214175224304199, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 276.0078125, "epoch": 0.9384765625, "grad_norm": 1.199646307200552, "kl": 0.06201171875, "learning_rate": 7.653808593749999e-07, "loss": 0.0025, "reward": 1.7694358825683594, "reward_std": 0.0673837810754776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.769435852766037, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 316.625, "epoch": 0.93896484375, "grad_norm": 0.9488691507074507, "kl": 0.0556640625, "learning_rate": 7.652587890624999e-07, "loss": 0.0022, "reward": 1.7294191718101501, "reward_std": 0.06298989057540894, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7294191718101501, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 308.5390625, "epoch": 0.939453125, "grad_norm": 0.9744407426363972, "kl": 0.0570068359375, "learning_rate": 7.6513671875e-07, "loss": 0.0023, "reward": 1.7692174911499023, "reward_std": 0.12869003787636757, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7926550805568695, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 305.7265625, "epoch": 0.93994140625, "grad_norm": 8.440290119049392, "kl": 0.048095703125, "learning_rate": 7.650146484375e-07, "loss": 0.0019, "reward": 1.7950489521026611, "reward_std": 0.06832708790898323, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7950489521026611, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 275.078125, "epoch": 0.9404296875, "grad_norm": 1.8177029228869115, "kl": 0.0528564453125, "learning_rate": 7.64892578125e-07, "loss": 0.0021, "reward": 1.6869670152664185, "reward_std": 0.14585554599761963, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6947795152664185, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 283.0546875, "epoch": 0.94091796875, "grad_norm": 1.3445342169878876, "kl": 0.0511474609375, "learning_rate": 7.647705078125e-07, "loss": 0.002, "reward": 1.7540799379348755, "reward_std": 0.06522182933986187, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7540798783302307, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 335.9453125, "epoch": 0.94140625, "grad_norm": 3.3655946441991906, "kl": 0.0565185546875, "learning_rate": 7.646484375e-07, "loss": 0.0023, "reward": 1.79484623670578, "reward_std": 0.07020819000899792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.79484623670578, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 325.4921875, "epoch": 0.94189453125, "grad_norm": 2.7406095890255506, "kl": 0.04931640625, "learning_rate": 7.645263671874999e-07, "loss": 0.002, "reward": 1.650872528553009, "reward_std": 0.07257736101746559, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.658685028553009, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 315.9609375, "epoch": 0.9423828125, "grad_norm": 9.439333223570415, "kl": 0.061279296875, "learning_rate": 7.644042968749999e-07, "loss": 0.0025, "reward": 1.7402021884918213, "reward_std": 0.11112450435757637, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7558271884918213, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 251.953125, "epoch": 0.94287109375, "grad_norm": 3.0515199906985773, "kl": 0.046875, "learning_rate": 7.642822265625e-07, "loss": 0.0019, "reward": 1.806718111038208, "reward_std": 0.04643261060118675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8067179918289185, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 252.828125, "epoch": 0.943359375, "grad_norm": 2.3895842970175463, "kl": 0.044189453125, "learning_rate": 7.6416015625e-07, "loss": 0.0018, "reward": 1.8205534219741821, "reward_std": 0.05191616341471672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8205534815788269, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 333.4609375, "epoch": 0.94384765625, "grad_norm": 1.7762575067749533, "kl": 0.0479736328125, "learning_rate": 7.640380859375e-07, "loss": 0.0019, "reward": 1.8342650532722473, "reward_std": 0.06186963617801666, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8342650830745697, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 430.515625, "epoch": 0.9443359375, "grad_norm": 1.1300241222412084, "kl": 0.0462646484375, "learning_rate": 7.63916015625e-07, "loss": 0.0019, "reward": 1.7758485078811646, "reward_std": 0.0529699232429266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7758485078811646, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 256.7890625, "epoch": 0.94482421875, "grad_norm": 1.4612311004000913, "kl": 0.048828125, "learning_rate": 7.637939453124999e-07, "loss": 0.002, "reward": 1.7642263770103455, "reward_std": 0.0411844439804554, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7642263472080231, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 363.5703125, "epoch": 0.9453125, "grad_norm": 1.0895138935674837, "kl": 0.039306640625, "learning_rate": 7.636718749999999e-07, "loss": 0.0016, "reward": 1.7819878458976746, "reward_std": 0.11622267588973045, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.789800375699997, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 296.3046875, "epoch": 0.94580078125, "grad_norm": 1.116509437268169, "kl": 0.049072265625, "learning_rate": 7.635498046875e-07, "loss": 0.002, "reward": 1.7431734204292297, "reward_std": 0.06907767802476883, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.750985860824585, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 209.5625, "epoch": 0.9462890625, "grad_norm": 1.0548626164801436, "kl": 0.05615234375, "learning_rate": 7.63427734375e-07, "loss": 0.0022, "reward": 1.7038698196411133, "reward_std": 0.08116939291357994, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7038698196411133, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 285.7890625, "epoch": 0.94677734375, "grad_norm": 2.6074049824933714, "kl": 0.063232421875, "learning_rate": 7.633056640625e-07, "loss": 0.0025, "reward": 1.6791431903839111, "reward_std": 0.11179608106613159, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6791431903839111, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 318.265625, "epoch": 0.947265625, "grad_norm": 4.189314836063207, "kl": 0.0457763671875, "learning_rate": 7.6318359375e-07, "loss": 0.0018, "reward": 1.7924708127975464, "reward_std": 0.04897610656917095, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7924707531929016, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 309.3828125, "epoch": 0.94775390625, "grad_norm": 1.1917903075664644, "kl": 0.0635986328125, "learning_rate": 7.630615234375e-07, "loss": 0.0025, "reward": 1.7675382494926453, "reward_std": 0.09949354082345963, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7753507494926453, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 331.703125, "epoch": 0.9482421875, "grad_norm": 0.6607591548251206, "kl": 0.04150390625, "learning_rate": 7.629394531249999e-07, "loss": 0.0017, "reward": 1.8325649499893188, "reward_std": 0.01903275726363063, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8325649201869965, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 282.6015625, "epoch": 0.94873046875, "grad_norm": 6.132868339589314, "kl": 0.073974609375, "learning_rate": 7.628173828124999e-07, "loss": 0.003, "reward": 1.6471970677375793, "reward_std": 0.060605697333812714, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6550095677375793, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 298.546875, "epoch": 0.94921875, "grad_norm": 1.4264393045627874, "kl": 0.0548095703125, "learning_rate": 7.626953125e-07, "loss": 0.0022, "reward": 1.7925902605056763, "reward_std": 0.07345704361796379, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8004027009010315, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 269.046875, "epoch": 0.94970703125, "grad_norm": 1.319335546915897, "kl": 0.049560546875, "learning_rate": 7.625732421875e-07, "loss": 0.002, "reward": 1.7566935420036316, "reward_std": 0.11580286920070648, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7723186016082764, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 259.7109375, "epoch": 0.9501953125, "grad_norm": 2.0534734192819637, "kl": 0.0673828125, "learning_rate": 7.62451171875e-07, "loss": 0.0027, "reward": 1.7132260203361511, "reward_std": 0.08049709908664227, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7210385203361511, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 346.140625, "epoch": 0.95068359375, "grad_norm": 2.215564027477621, "kl": 0.0567626953125, "learning_rate": 7.623291015625e-07, "loss": 0.0023, "reward": 1.6834967136383057, "reward_std": 0.04565897583961487, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6834966838359833, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 266.2734375, "epoch": 0.951171875, "grad_norm": 2.075455827754385, "kl": 0.0616455078125, "learning_rate": 7.622070312499999e-07, "loss": 0.0025, "reward": 1.688484787940979, "reward_std": 0.09864621236920357, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.696297287940979, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 215.140625, "epoch": 0.95166015625, "grad_norm": 2.250374045260722, "kl": 0.05810546875, "learning_rate": 7.620849609374999e-07, "loss": 0.0023, "reward": 1.8399544954299927, "reward_std": 0.07792560383677483, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8399545550346375, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 283.359375, "epoch": 0.9521484375, "grad_norm": 1.8302443569238318, "kl": 0.0623779296875, "learning_rate": 7.619628906249999e-07, "loss": 0.0025, "reward": 1.690042495727539, "reward_std": 0.07748877070844173, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6900425255298615, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 317.0, "epoch": 0.95263671875, "grad_norm": 1.2995294061711256, "kl": 0.056396484375, "learning_rate": 7.618408203125e-07, "loss": 0.0023, "reward": 1.7628344893455505, "reward_std": 0.038826122879981995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7628344595432281, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 267.875, "epoch": 0.953125, "grad_norm": 0.8592355590617015, "kl": 0.0640869140625, "learning_rate": 7.6171875e-07, "loss": 0.0026, "reward": 1.6795039176940918, "reward_std": 0.03331707790493965, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6795038878917694, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 279.90625, "epoch": 0.95361328125, "grad_norm": 7.492200822054816, "kl": 0.08251953125, "learning_rate": 7.615966796875e-07, "loss": 0.0033, "reward": 1.6888149976730347, "reward_std": 0.1778181865811348, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7122524678707123, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 307.5859375, "epoch": 0.9541015625, "grad_norm": 1.7680737074288075, "kl": 0.060791015625, "learning_rate": 7.61474609375e-07, "loss": 0.0024, "reward": 1.7545133829116821, "reward_std": 0.07330542802810669, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7623258829116821, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 317.7421875, "epoch": 0.95458984375, "grad_norm": 1.5553080968803128, "kl": 0.0546875, "learning_rate": 7.613525390624999e-07, "loss": 0.0022, "reward": 1.8474570512771606, "reward_std": 0.06518928147852421, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8474570512771606, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 295.25, "epoch": 0.955078125, "grad_norm": 3.5111551151395557, "kl": 0.0557861328125, "learning_rate": 7.612304687499999e-07, "loss": 0.0022, "reward": 1.6904324293136597, "reward_std": 0.04466338828206062, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6904323995113373, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 330.453125, "epoch": 0.95556640625, "grad_norm": 1.8392727464500473, "kl": 0.066162109375, "learning_rate": 7.611083984375e-07, "loss": 0.0027, "reward": 1.7047904133796692, "reward_std": 0.07609418779611588, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7126030325889587, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 279.6875, "epoch": 0.9560546875, "grad_norm": 4.599343237367128, "kl": 0.073486328125, "learning_rate": 7.60986328125e-07, "loss": 0.0029, "reward": 1.6910215616226196, "reward_std": 0.06167110428214073, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.691021591424942, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 308.0703125, "epoch": 0.95654296875, "grad_norm": 1.7998521981288202, "kl": 0.0550537109375, "learning_rate": 7.608642578125e-07, "loss": 0.0022, "reward": 1.8349308371543884, "reward_std": 0.06764688296243548, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8505558371543884, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 332.53125, "epoch": 0.95703125, "grad_norm": 1.7450693187577557, "kl": 0.057373046875, "learning_rate": 7.607421875e-07, "loss": 0.0023, "reward": 1.5837258696556091, "reward_std": 0.10083448141813278, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5915383994579315, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 325.9921875, "epoch": 0.95751953125, "grad_norm": 11.114277968224194, "kl": 0.0599365234375, "learning_rate": 7.606201171874999e-07, "loss": 0.0024, "reward": 1.6976925134658813, "reward_std": 0.056451691314578056, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6976925134658813, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 261.359375, "epoch": 0.9580078125, "grad_norm": 2.105224164489077, "kl": 0.074951171875, "learning_rate": 7.604980468749999e-07, "loss": 0.003, "reward": 1.6166620254516602, "reward_std": 0.08987650275230408, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6244744658470154, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 289.96875, "epoch": 0.95849609375, "grad_norm": 0.9690709225249844, "kl": 0.0579833984375, "learning_rate": 7.603759765624999e-07, "loss": 0.0023, "reward": 1.6878407001495361, "reward_std": 0.07451405934989452, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6878407299518585, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 381.375, "epoch": 0.958984375, "grad_norm": 3.78849539137508, "kl": 0.0511474609375, "learning_rate": 7.6025390625e-07, "loss": 0.002, "reward": 1.6614224910736084, "reward_std": 0.15857132896780968, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6770474314689636, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 386.59375, "epoch": 0.95947265625, "grad_norm": 0.9652488508161328, "kl": 0.0633544921875, "learning_rate": 7.601318359375e-07, "loss": 0.0025, "reward": 1.6580791473388672, "reward_std": 0.11845768243074417, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7049541175365448, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 264.3125, "epoch": 0.9599609375, "grad_norm": 1.8006023215240339, "kl": 0.071044921875, "learning_rate": 7.60009765625e-07, "loss": 0.0028, "reward": 1.6227675080299377, "reward_std": 0.12311100959777832, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6227675080299377, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 317.796875, "epoch": 0.96044921875, "grad_norm": 1.2712786186046117, "kl": 0.10009765625, "learning_rate": 7.598876953125e-07, "loss": 0.004, "reward": 1.69350266456604, "reward_std": 0.051613882184028625, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6935026347637177, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 295.4765625, "epoch": 0.9609375, "grad_norm": 5.457319655705029, "kl": 0.0501708984375, "learning_rate": 7.597656249999999e-07, "loss": 0.002, "reward": 1.769425630569458, "reward_std": 0.040609823539853096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7694256603717804, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 382.875, "epoch": 0.96142578125, "grad_norm": 1.1418180815570933, "kl": 0.0518798828125, "learning_rate": 7.596435546874999e-07, "loss": 0.0021, "reward": 1.7559481859207153, "reward_std": 0.03939475491642952, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7559481859207153, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 344.5546875, "epoch": 0.9619140625, "grad_norm": 2.761562135305564, "kl": 0.0643310546875, "learning_rate": 7.59521484375e-07, "loss": 0.0026, "reward": 1.6901981830596924, "reward_std": 0.044297732412815094, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.69019815325737, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 331.3125, "epoch": 0.96240234375, "grad_norm": 4.959957474481586, "kl": 0.061279296875, "learning_rate": 7.593994140625e-07, "loss": 0.0024, "reward": 1.7054769396781921, "reward_std": 0.12026718631386757, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7054769396781921, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 255.2421875, "epoch": 0.962890625, "grad_norm": 1.0195643681683435, "kl": 0.0621337890625, "learning_rate": 7.5927734375e-07, "loss": 0.0025, "reward": 1.7764147520065308, "reward_std": 0.022609219886362553, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7764147520065308, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 394.8359375, "epoch": 0.96337890625, "grad_norm": 2.7446568495906796, "kl": 0.0595703125, "learning_rate": 7.591552734375e-07, "loss": 0.0024, "reward": 1.7252334952354431, "reward_std": 0.21081995964050293, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.7877334952354431, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 354.2734375, "epoch": 0.9638671875, "grad_norm": 0.8602906688907507, "kl": 0.068603515625, "learning_rate": 7.59033203125e-07, "loss": 0.0027, "reward": 1.701697051525116, "reward_std": 0.06423486396670341, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7095095813274384, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 312.953125, "epoch": 0.96435546875, "grad_norm": 2.2124515036462893, "kl": 0.06787109375, "learning_rate": 7.589111328124999e-07, "loss": 0.0027, "reward": 1.7300501465797424, "reward_std": 0.11370932310819626, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7378626465797424, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 300.4765625, "epoch": 0.96484375, "grad_norm": 1.2753647461496138, "kl": 0.068115234375, "learning_rate": 7.587890624999999e-07, "loss": 0.0027, "reward": 1.7049716711044312, "reward_std": 0.04964887537062168, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7049716711044312, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 338.2421875, "epoch": 0.96533203125, "grad_norm": 1.3004843395679404, "kl": 0.071044921875, "learning_rate": 7.586669921875e-07, "loss": 0.0028, "reward": 1.7647384405136108, "reward_std": 0.09294159710407257, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7647384405136108, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 277.484375, "epoch": 0.9658203125, "grad_norm": 2.020273111626442, "kl": 0.073486328125, "learning_rate": 7.58544921875e-07, "loss": 0.0029, "reward": 1.660966157913208, "reward_std": 0.10214090719819069, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6687787771224976, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 277.21875, "epoch": 0.96630859375, "grad_norm": 3.3338455450291704, "kl": 0.0609130859375, "learning_rate": 7.584228515625e-07, "loss": 0.0024, "reward": 1.774294674396515, "reward_std": 0.07802858576178551, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7742947041988373, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 295.3046875, "epoch": 0.966796875, "grad_norm": 1.4604029448276346, "kl": 0.0670166015625, "learning_rate": 7.5830078125e-07, "loss": 0.0027, "reward": 1.7650516033172607, "reward_std": 0.10301512852311134, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7806766629219055, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 428.0546875, "epoch": 0.96728515625, "grad_norm": 1.6943394591205752, "kl": 0.0499267578125, "learning_rate": 7.581787109374999e-07, "loss": 0.002, "reward": 1.5032365322113037, "reward_std": 0.17277055978775024, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.5422990322113037, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 329.65625, "epoch": 0.9677734375, "grad_norm": 2.8707007057084284, "kl": 0.067626953125, "learning_rate": 7.580566406249999e-07, "loss": 0.0027, "reward": 1.7244834303855896, "reward_std": 0.09691913425922394, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7244834303855896, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 306.1015625, "epoch": 0.96826171875, "grad_norm": 2.317509515130158, "kl": 0.0565185546875, "learning_rate": 7.579345703125e-07, "loss": 0.0023, "reward": 1.8406411409378052, "reward_std": 0.04954299796372652, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8406412601470947, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 393.0625, "epoch": 0.96875, "grad_norm": 1.1665337790056094, "kl": 0.0460205078125, "learning_rate": 7.578125e-07, "loss": 0.0018, "reward": 1.871698260307312, "reward_std": 0.08394120261073112, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.879510760307312, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 290.65625, "epoch": 0.96923828125, "grad_norm": 1.5997848800347867, "kl": 0.07080078125, "learning_rate": 7.576904296875e-07, "loss": 0.0028, "reward": 1.6359334588050842, "reward_std": 0.05299815069884062, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6359334290027618, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 251.8359375, "epoch": 0.9697265625, "grad_norm": 1.5599863734636639, "kl": 0.0576171875, "learning_rate": 7.57568359375e-07, "loss": 0.0023, "reward": 1.8334488272666931, "reward_std": 0.046138789504766464, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8334488272666931, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 321.8984375, "epoch": 0.97021484375, "grad_norm": 1.444134658928476, "kl": 0.078125, "learning_rate": 7.574462890625e-07, "loss": 0.0031, "reward": 1.6236762404441833, "reward_std": 0.04829781036823988, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6236762404441833, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 305.1953125, "epoch": 0.970703125, "grad_norm": 1.758426859641077, "kl": 0.0589599609375, "learning_rate": 7.573242187499999e-07, "loss": 0.0024, "reward": 1.7658716440200806, "reward_std": 0.09004973247647285, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7658716142177582, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 350.7890625, "epoch": 0.97119140625, "grad_norm": 1.6967311663572766, "kl": 0.053955078125, "learning_rate": 7.572021484374999e-07, "loss": 0.0022, "reward": 1.6358023881912231, "reward_std": 0.11093928292393684, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6436149477958679, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 406.8828125, "epoch": 0.9716796875, "grad_norm": 2.04207112904878, "kl": 0.0439453125, "learning_rate": 7.57080078125e-07, "loss": 0.0018, "reward": 1.81356942653656, "reward_std": 0.050269074738025665, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8213819265365601, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 355.890625, "epoch": 0.97216796875, "grad_norm": 6.093675872108313, "kl": 0.0615234375, "learning_rate": 7.569580078125e-07, "loss": 0.0025, "reward": 1.7886452674865723, "reward_std": 0.1061076745390892, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7964576780796051, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 286.5078125, "epoch": 0.97265625, "grad_norm": 3.270820256088369, "kl": 0.06689453125, "learning_rate": 7.568359375e-07, "loss": 0.0027, "reward": 1.8523313999176025, "reward_std": 0.07106838375329971, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8523313105106354, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 320.21875, "epoch": 0.97314453125, "grad_norm": 1.8113236486460977, "kl": 0.05078125, "learning_rate": 7.567138671875e-07, "loss": 0.002, "reward": 1.7407814264297485, "reward_std": 0.10060215182602406, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7485939860343933, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 284.203125, "epoch": 0.9736328125, "grad_norm": 3.923636866884382, "kl": 0.0592041015625, "learning_rate": 7.565917968749999e-07, "loss": 0.0024, "reward": 1.7928686141967773, "reward_std": 0.07088093087077141, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.792868584394455, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 295.65625, "epoch": 0.97412109375, "grad_norm": 1.0205069593502663, "kl": 0.0491943359375, "learning_rate": 7.564697265624999e-07, "loss": 0.002, "reward": 1.820485532283783, "reward_std": 0.034951613284647465, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.820485532283783, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 398.3125, "epoch": 0.974609375, "grad_norm": 1.3146869441919158, "kl": 0.0570068359375, "learning_rate": 7.5634765625e-07, "loss": 0.0023, "reward": 1.6975049376487732, "reward_std": 0.050795383751392365, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6975049078464508, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 311.9765625, "epoch": 0.97509765625, "grad_norm": 2.103512397807353, "kl": 0.0506591796875, "learning_rate": 7.562255859375e-07, "loss": 0.002, "reward": 1.7807487845420837, "reward_std": 0.06361746462062001, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7885612845420837, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 270.015625, "epoch": 0.9755859375, "grad_norm": 9.688121646518365, "kl": 0.076416015625, "learning_rate": 7.56103515625e-07, "loss": 0.0031, "reward": 1.704875409603119, "reward_std": 0.09089740738272667, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7126878798007965, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 333.953125, "epoch": 0.97607421875, "grad_norm": 1.7750779721775813, "kl": 0.0582275390625, "learning_rate": 7.559814453125e-07, "loss": 0.0023, "reward": 1.7654090523719788, "reward_std": 0.07578187435865402, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7654090225696564, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 392.6640625, "epoch": 0.9765625, "grad_norm": 6.238100855818762, "kl": 0.0582275390625, "learning_rate": 7.55859375e-07, "loss": 0.0023, "reward": 1.7102959752082825, "reward_std": 0.13805482536554337, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7259210050106049, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 291.859375, "epoch": 0.97705078125, "grad_norm": 4.341560277329405, "kl": 0.054443359375, "learning_rate": 7.557373046874999e-07, "loss": 0.0022, "reward": 1.7868390083312988, "reward_std": 0.04986852779984474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7868389785289764, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 361.96875, "epoch": 0.9775390625, "grad_norm": 1.11717831151428, "kl": 0.0552978515625, "learning_rate": 7.556152343749999e-07, "loss": 0.0022, "reward": 1.6372390389442444, "reward_std": 0.04036341607570648, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6372390389442444, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 332.09375, "epoch": 0.97802734375, "grad_norm": 1.1083383023566933, "kl": 0.0694580078125, "learning_rate": 7.554931640625e-07, "loss": 0.0028, "reward": 1.6198468804359436, "reward_std": 0.08751692995429039, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.62765933573246, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 245.453125, "epoch": 0.978515625, "grad_norm": 3.2731263361849963, "kl": 0.0699462890625, "learning_rate": 7.5537109375e-07, "loss": 0.0028, "reward": 1.7005380988121033, "reward_std": 0.07136748731136322, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7005380988121033, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 321.125, "epoch": 0.97900390625, "grad_norm": 4.932970901055641, "kl": 0.056396484375, "learning_rate": 7.552490234375e-07, "loss": 0.0023, "reward": 1.799683392047882, "reward_std": 0.060001108795404434, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7996833622455597, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 313.4140625, "epoch": 0.9794921875, "grad_norm": 1.5941330883157996, "kl": 0.0606689453125, "learning_rate": 7.55126953125e-07, "loss": 0.0024, "reward": 1.773667812347412, "reward_std": 0.058895327150821686, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7892928421497345, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 330.0703125, "epoch": 0.97998046875, "grad_norm": 1.3767151749195397, "kl": 0.0626220703125, "learning_rate": 7.550048828124999e-07, "loss": 0.0025, "reward": 1.7313017845153809, "reward_std": 0.08502375334501266, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7391143441200256, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 307.9453125, "epoch": 0.98046875, "grad_norm": 2.620215945991878, "kl": 0.0706787109375, "learning_rate": 7.548828124999999e-07, "loss": 0.0028, "reward": 1.762086808681488, "reward_std": 0.09928128868341446, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.769899308681488, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 288.2734375, "epoch": 0.98095703125, "grad_norm": 3.8108526407917065, "kl": 0.0550537109375, "learning_rate": 7.547607421875e-07, "loss": 0.0022, "reward": 1.695708990097046, "reward_std": 0.052436916157603264, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6957089602947235, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 318.9140625, "epoch": 0.9814453125, "grad_norm": 1.733038006998932, "kl": 0.0518798828125, "learning_rate": 7.54638671875e-07, "loss": 0.0021, "reward": 1.791795015335083, "reward_std": 0.05815849453210831, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.791795015335083, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 277.28125, "epoch": 0.98193359375, "grad_norm": 3.978067562200423, "kl": 0.069091796875, "learning_rate": 7.545166015625e-07, "loss": 0.0028, "reward": 1.7538942098617554, "reward_std": 0.10411181300878525, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7617067396640778, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 256.6875, "epoch": 0.982421875, "grad_norm": 5.320638926802918, "kl": 0.062255859375, "learning_rate": 7.5439453125e-07, "loss": 0.0025, "reward": 1.755543053150177, "reward_std": 0.07938620075583458, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.755543053150177, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 263.5390625, "epoch": 0.98291015625, "grad_norm": 2.7469000275015745, "kl": 0.058837890625, "learning_rate": 7.542724609375e-07, "loss": 0.0024, "reward": 1.6718215942382812, "reward_std": 0.09080488607287407, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6718215942382812, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 314.5703125, "epoch": 0.9833984375, "grad_norm": 1.7108353133321768, "kl": 0.0496826171875, "learning_rate": 7.541503906249999e-07, "loss": 0.002, "reward": 1.7525351643562317, "reward_std": 0.06778106465935707, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7525351941585541, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 392.4765625, "epoch": 0.98388671875, "grad_norm": 1.4726300800994188, "kl": 0.051513671875, "learning_rate": 7.540283203124999e-07, "loss": 0.0021, "reward": 1.7786903977394104, "reward_std": 0.03883876092731953, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7786904275417328, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 294.390625, "epoch": 0.984375, "grad_norm": 2.204556954621402, "kl": 0.0611572265625, "learning_rate": 7.5390625e-07, "loss": 0.0024, "reward": 1.788576900959015, "reward_std": 0.05577550455927849, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7885768711566925, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 344.125, "epoch": 0.98486328125, "grad_norm": 1.666818054703339, "kl": 0.048095703125, "learning_rate": 7.537841796875e-07, "loss": 0.0019, "reward": 1.7710611820220947, "reward_std": 0.1724838688969612, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8101237118244171, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 287.90625, "epoch": 0.9853515625, "grad_norm": 1.4819925595821943, "kl": 0.055419921875, "learning_rate": 7.53662109375e-07, "loss": 0.0022, "reward": 1.844101369380951, "reward_std": 0.11457358300685883, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8519138097763062, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 319.1640625, "epoch": 0.98583984375, "grad_norm": 0.863402575226768, "kl": 0.057373046875, "learning_rate": 7.535400390625e-07, "loss": 0.0023, "reward": 1.8190799951553345, "reward_std": 0.028183109126985073, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8190799951553345, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 278.9609375, "epoch": 0.986328125, "grad_norm": 1.693685033843237, "kl": 0.0555419921875, "learning_rate": 7.534179687499999e-07, "loss": 0.0022, "reward": 1.8829106092453003, "reward_std": 0.03998455451801419, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8829106092453003, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 290.8046875, "epoch": 0.98681640625, "grad_norm": 1.5870071937237666, "kl": 0.0589599609375, "learning_rate": 7.532958984374999e-07, "loss": 0.0024, "reward": 1.7445420026779175, "reward_std": 0.05026637949049473, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7445419728755951, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 345.890625, "epoch": 0.9873046875, "grad_norm": 1.6854847331554719, "kl": 0.0511474609375, "learning_rate": 7.53173828125e-07, "loss": 0.002, "reward": 1.7165476083755493, "reward_std": 0.1217353455722332, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7399851083755493, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 266.1875, "epoch": 0.98779296875, "grad_norm": 3.876567272736264, "kl": 0.0650634765625, "learning_rate": 7.530517578125e-07, "loss": 0.0026, "reward": 1.648318886756897, "reward_std": 0.11310148239135742, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.656131386756897, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 250.765625, "epoch": 0.98828125, "grad_norm": 17.515994932795444, "kl": 0.0703125, "learning_rate": 7.529296875e-07, "loss": 0.0028, "reward": 1.788986623287201, "reward_std": 0.04432579409331083, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7889866828918457, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 357.109375, "epoch": 0.98876953125, "grad_norm": 3.17134976005998, "kl": 0.056640625, "learning_rate": 7.528076171875e-07, "loss": 0.0023, "reward": 1.741170048713684, "reward_std": 0.1307641789317131, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7489825487136841, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 202.671875, "epoch": 0.9892578125, "grad_norm": 0.6360837641452156, "kl": 0.063720703125, "learning_rate": 7.52685546875e-07, "loss": 0.0025, "reward": 1.833198606967926, "reward_std": 0.033319685608148575, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8331986367702484, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 300.640625, "epoch": 0.98974609375, "grad_norm": 2.1336492840699175, "kl": 0.057373046875, "learning_rate": 7.525634765624999e-07, "loss": 0.0023, "reward": 1.6746537685394287, "reward_std": 0.06350501254200935, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6746538877487183, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 224.0703125, "epoch": 0.990234375, "grad_norm": 4.759167762843545, "kl": 0.064453125, "learning_rate": 7.524414062499999e-07, "loss": 0.0026, "reward": 1.7838603258132935, "reward_std": 0.09581628814339638, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7838603258132935, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 382.75, "epoch": 0.99072265625, "grad_norm": 1.6603971592521138, "kl": 0.063232421875, "learning_rate": 7.523193359375e-07, "loss": 0.0025, "reward": 1.6969883441925049, "reward_std": 0.08057832717895508, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6969882547855377, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 368.3125, "epoch": 0.9912109375, "grad_norm": 1.180993131330784, "kl": 0.0533447265625, "learning_rate": 7.52197265625e-07, "loss": 0.0021, "reward": 1.6063008308410645, "reward_std": 0.17890335619449615, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6453633606433868, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 366.0234375, "epoch": 0.99169921875, "grad_norm": 1.095038299634487, "kl": 0.0523681640625, "learning_rate": 7.520751953125e-07, "loss": 0.0021, "reward": 1.779970109462738, "reward_std": 0.08126384392380714, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7955950498580933, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 336.8984375, "epoch": 0.9921875, "grad_norm": 2.6979143608108265, "kl": 0.0673828125, "learning_rate": 7.51953125e-07, "loss": 0.0027, "reward": 1.740858554840088, "reward_std": 0.09589342772960663, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7408585846424103, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 281.609375, "epoch": 0.99267578125, "grad_norm": 2.848196704773645, "kl": 0.05908203125, "learning_rate": 7.518310546874999e-07, "loss": 0.0024, "reward": 1.7175134420394897, "reward_std": 0.028206244111061096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.717513382434845, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 318.7890625, "epoch": 0.9931640625, "grad_norm": 3.5557294743989054, "kl": 0.0601806640625, "learning_rate": 7.517089843749999e-07, "loss": 0.0024, "reward": 1.8203625082969666, "reward_std": 0.060490844771265984, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8203625082969666, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 346.21875, "epoch": 0.99365234375, "grad_norm": 1.016603411671492, "kl": 0.0650634765625, "learning_rate": 7.515869140625e-07, "loss": 0.0026, "reward": 1.7698943614959717, "reward_std": 0.05688617751002312, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7698944211006165, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 339.234375, "epoch": 0.994140625, "grad_norm": 1.1139849774388095, "kl": 0.0531005859375, "learning_rate": 7.5146484375e-07, "loss": 0.0021, "reward": 1.829136312007904, "reward_std": 0.030549502931535244, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8291363418102264, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 267.609375, "epoch": 0.99462890625, "grad_norm": 1.9862434556565576, "kl": 0.08837890625, "learning_rate": 7.513427734375e-07, "loss": 0.0035, "reward": 1.7540498971939087, "reward_std": 0.019931727088987827, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7540498673915863, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 249.328125, "epoch": 0.9951171875, "grad_norm": 1.245846895225628, "kl": 0.066650390625, "learning_rate": 7.51220703125e-07, "loss": 0.0027, "reward": 1.775355041027069, "reward_std": 0.03905859775841236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7753550708293915, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 303.9609375, "epoch": 0.99560546875, "grad_norm": 5.356625380854759, "kl": 0.0936279296875, "learning_rate": 7.510986328125e-07, "loss": 0.0037, "reward": 1.7761430740356445, "reward_std": 0.08487707003951073, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7761430144309998, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 336.609375, "epoch": 0.99609375, "grad_norm": 0.8395847923657573, "kl": 0.0499267578125, "learning_rate": 7.509765624999999e-07, "loss": 0.002, "reward": 1.8729313015937805, "reward_std": 0.02620452456176281, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8729313611984253, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 281.625, "epoch": 0.99658203125, "grad_norm": 2.022128826305776, "kl": 0.052734375, "learning_rate": 7.508544921874999e-07, "loss": 0.0021, "reward": 1.725024938583374, "reward_std": 0.12236949801445007, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.732837438583374, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 280.859375, "epoch": 0.9970703125, "grad_norm": 15.315894492212703, "kl": 0.0631103515625, "learning_rate": 7.50732421875e-07, "loss": 0.0025, "reward": 1.7646106481552124, "reward_std": 0.09433956071734428, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7724231779575348, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 309.3671875, "epoch": 0.99755859375, "grad_norm": 8.3408933772686, "kl": 0.07275390625, "learning_rate": 7.506103515625e-07, "loss": 0.0029, "reward": 1.7954939603805542, "reward_std": 0.060113584622740746, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.795494019985199, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 302.3515625, "epoch": 0.998046875, "grad_norm": 2.462496443372101, "kl": 0.0589599609375, "learning_rate": 7.5048828125e-07, "loss": 0.0024, "reward": 1.7298526167869568, "reward_std": 0.10707394033670425, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7298526465892792, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 273.171875, "epoch": 0.99853515625, "grad_norm": 2.026729878970871, "kl": 0.0565185546875, "learning_rate": 7.503662109375e-07, "loss": 0.0023, "reward": 1.7790692448616028, "reward_std": 0.07894434407353401, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7868817448616028, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 307.328125, "epoch": 0.9990234375, "grad_norm": 2.3300701728667375, "kl": 0.0714111328125, "learning_rate": 7.502441406249999e-07, "loss": 0.0029, "reward": 1.7110105156898499, "reward_std": 0.15207843482494354, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7422605454921722, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 379.68031311035156, "epoch": 0.99951171875, "grad_norm": 1.141841899027945, "kl": 0.05810546875, "learning_rate": 7.501220703124999e-07, "loss": 0.0024, "reward": 1.8359350562095642, "reward_std": 0.14897098392248154, "rewards/format_reward": 0.9754097759723663, "rewards/ocr_reward": 0.8605252206325531, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 268.5625, "epoch": 1.00048828125, "grad_norm": 3.803502318684469, "kl": 0.0550537109375, "learning_rate": 7.5e-07, "loss": 0.0022, "reward": 1.8090072274208069, "reward_std": 0.06662950664758682, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8090072870254517, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 266.046875, "epoch": 1.0009765625, "grad_norm": 0.7862946563465897, "kl": 0.06298828125, "learning_rate": 7.498779296875e-07, "loss": 0.0025, "reward": 1.6712990999221802, "reward_std": 0.02280174382030964, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6712990701198578, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 293.5078125, "epoch": 1.00146484375, "grad_norm": 1.342303670259137, "kl": 0.0587158203125, "learning_rate": 7.49755859375e-07, "loss": 0.0023, "reward": 1.840239703655243, "reward_std": 0.023719463497400284, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8402397036552429, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 267.3828125, "epoch": 1.001953125, "grad_norm": 0.9572879054411417, "kl": 0.0562744140625, "learning_rate": 7.496337890625e-07, "loss": 0.0023, "reward": 1.8105382919311523, "reward_std": 0.09160010330379009, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8183507919311523, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 382.640625, "epoch": 1.00244140625, "grad_norm": 1.6102653045301574, "kl": 0.05078125, "learning_rate": 7.4951171875e-07, "loss": 0.002, "reward": 1.6806508302688599, "reward_std": 0.1597279291599989, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7275258004665375, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 471.0, "epoch": 1.0029296875, "grad_norm": 1.4804709133667804, "kl": 0.050537109375, "learning_rate": 7.493896484374999e-07, "loss": 0.002, "reward": 1.804791808128357, "reward_std": 0.11673609726130962, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8204168379306793, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 285.546875, "epoch": 1.00341796875, "grad_norm": 1.9657773937285294, "kl": 0.056396484375, "learning_rate": 7.492675781249999e-07, "loss": 0.0023, "reward": 1.8031712174415588, "reward_std": 0.11098561063408852, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8109836876392365, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 437.2265625, "epoch": 1.00390625, "grad_norm": 1.6895047631875901, "kl": 0.053955078125, "learning_rate": 7.491455078125e-07, "loss": 0.0022, "reward": 1.648207187652588, "reward_std": 0.2681009843945503, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7028946876525879, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 352.71875, "epoch": 1.00439453125, "grad_norm": 3.0164245238907608, "kl": 0.0562744140625, "learning_rate": 7.490234375e-07, "loss": 0.0023, "reward": 1.7384542226791382, "reward_std": 0.09973935224115849, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7462667226791382, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 303.0078125, "epoch": 1.0048828125, "grad_norm": 1.4926521960283892, "kl": 0.0579833984375, "learning_rate": 7.489013671875e-07, "loss": 0.0023, "reward": 1.8106223940849304, "reward_std": 0.02411152981221676, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8106224536895752, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 337.171875, "epoch": 1.00537109375, "grad_norm": 1.418929150113678, "kl": 0.0513916015625, "learning_rate": 7.48779296875e-07, "loss": 0.0021, "reward": 1.733910322189331, "reward_std": 0.11833417788147926, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7417227923870087, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 344.359375, "epoch": 1.005859375, "grad_norm": 2.1455363975809427, "kl": 0.0540771484375, "learning_rate": 7.486572265624999e-07, "loss": 0.0022, "reward": 1.7620959281921387, "reward_std": 0.13864869251847267, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7933458983898163, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 352.7578125, "epoch": 1.00634765625, "grad_norm": 1.4711170006450227, "kl": 0.0628662109375, "learning_rate": 7.485351562499999e-07, "loss": 0.0025, "reward": 1.8471481800079346, "reward_std": 0.04248751141130924, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8471481502056122, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 326.0859375, "epoch": 1.0068359375, "grad_norm": 1.6633546292914565, "kl": 0.0538330078125, "learning_rate": 7.484130859374999e-07, "loss": 0.0022, "reward": 1.6721433401107788, "reward_std": 0.08107060939073563, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.695580929517746, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 387.359375, "epoch": 1.00732421875, "grad_norm": 2.7740994248622304, "kl": 0.0533447265625, "learning_rate": 7.48291015625e-07, "loss": 0.0021, "reward": 1.72020024061203, "reward_std": 0.09270552173256874, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.72801274061203, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 296.21875, "epoch": 1.0078125, "grad_norm": 1.8930966049480564, "kl": 0.0594482421875, "learning_rate": 7.481689453125e-07, "loss": 0.0024, "reward": 1.8536216616630554, "reward_std": 0.06580028869211674, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8692466914653778, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 268.9453125, "epoch": 1.00830078125, "grad_norm": 5.427251340055625, "kl": 0.0599365234375, "learning_rate": 7.48046875e-07, "loss": 0.0024, "reward": 1.766296148300171, "reward_std": 0.07562026381492615, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7819211483001709, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 246.7265625, "epoch": 1.0087890625, "grad_norm": 1.6714670851072095, "kl": 0.0499267578125, "learning_rate": 7.479248046875e-07, "loss": 0.002, "reward": 1.8139212131500244, "reward_std": 0.08759323135018349, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8217337727546692, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 458.5234375, "epoch": 1.00927734375, "grad_norm": 1.4835031644240106, "kl": 0.0606689453125, "learning_rate": 7.478027343749999e-07, "loss": 0.0024, "reward": 1.5658519268035889, "reward_std": 0.17432072386145592, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.5971018970012665, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 339.8125, "epoch": 1.009765625, "grad_norm": 1.002420848386625, "kl": 0.06494140625, "learning_rate": 7.476806640624999e-07, "loss": 0.0026, "reward": 1.7776638269424438, "reward_std": 0.08371632359921932, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7854763865470886, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 250.859375, "epoch": 1.01025390625, "grad_norm": 1.8316432776762526, "kl": 0.055908203125, "learning_rate": 7.4755859375e-07, "loss": 0.0022, "reward": 1.7354426980018616, "reward_std": 0.08602847345173359, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.735442727804184, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 290.9609375, "epoch": 1.0107421875, "grad_norm": 0.8791339224441008, "kl": 0.0540771484375, "learning_rate": 7.474365234375e-07, "loss": 0.0022, "reward": 1.8300225734710693, "reward_std": 0.035275645554065704, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.830022543668747, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 357.171875, "epoch": 1.01123046875, "grad_norm": 1.4982277249016536, "kl": 0.04736328125, "learning_rate": 7.47314453125e-07, "loss": 0.0019, "reward": 1.8361040353775024, "reward_std": 0.03841123543679714, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8361040949821472, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 337.234375, "epoch": 1.01171875, "grad_norm": 0.6837506894975471, "kl": 0.0616455078125, "learning_rate": 7.471923828125e-07, "loss": 0.0025, "reward": 1.788848340511322, "reward_std": 0.058198969811201096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7888484299182892, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 313.6015625, "epoch": 1.01220703125, "grad_norm": 1.443895916653485, "kl": 0.077392578125, "learning_rate": 7.470703125e-07, "loss": 0.0031, "reward": 1.6802573204040527, "reward_std": 0.11816703528165817, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6880699098110199, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 270.5859375, "epoch": 1.0126953125, "grad_norm": 6.059167846056267, "kl": 0.065673828125, "learning_rate": 7.469482421874999e-07, "loss": 0.0026, "reward": 1.7388933897018433, "reward_std": 0.07620543800294399, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7388934195041656, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 265.5078125, "epoch": 1.01318359375, "grad_norm": 2.3817776057089093, "kl": 0.06494140625, "learning_rate": 7.468261718749999e-07, "loss": 0.0026, "reward": 1.7072933316230774, "reward_std": 0.0740668810904026, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.707293301820755, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 214.8671875, "epoch": 1.013671875, "grad_norm": 1.3422401028685387, "kl": 0.072998046875, "learning_rate": 7.467041015625e-07, "loss": 0.0029, "reward": 1.733224332332611, "reward_std": 0.06582791358232498, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7332243919372559, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 364.046875, "epoch": 1.01416015625, "grad_norm": 0.9975798038955767, "kl": 0.061767578125, "learning_rate": 7.4658203125e-07, "loss": 0.0025, "reward": 1.645218014717102, "reward_std": 0.02710463386029005, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.645218014717102, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 265.125, "epoch": 1.0146484375, "grad_norm": 1.565205712470546, "kl": 0.07861328125, "learning_rate": 7.464599609375e-07, "loss": 0.0031, "reward": 1.6886191368103027, "reward_std": 0.03423266182653606, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.688619077205658, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 244.8984375, "epoch": 1.01513671875, "grad_norm": 2.642889571657758, "kl": 0.0645751953125, "learning_rate": 7.46337890625e-07, "loss": 0.0026, "reward": 1.787465751171112, "reward_std": 0.04049981106072664, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7874657511711121, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 331.75, "epoch": 1.015625, "grad_norm": 2.426800209345326, "kl": 0.0562744140625, "learning_rate": 7.462158203124999e-07, "loss": 0.0023, "reward": 1.71806001663208, "reward_std": 0.030627473257482052, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7180599868297577, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 404.2890625, "epoch": 1.01611328125, "grad_norm": 5.675745821516389, "kl": 0.062255859375, "learning_rate": 7.460937499999999e-07, "loss": 0.0025, "reward": 1.7228577136993408, "reward_std": 0.10195358097553253, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7306701838970184, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 365.0390625, "epoch": 1.0166015625, "grad_norm": 3.7013692307890995, "kl": 0.060302734375, "learning_rate": 7.459716796875e-07, "loss": 0.0024, "reward": 1.9432930946350098, "reward_std": 0.15386238880455494, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9432931840419769, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 296.078125, "epoch": 1.01708984375, "grad_norm": 2.032809989173598, "kl": 0.076416015625, "learning_rate": 7.45849609375e-07, "loss": 0.0031, "reward": 1.7166752815246582, "reward_std": 0.0454743467271328, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7166752815246582, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 294.65625, "epoch": 1.017578125, "grad_norm": 3.864692981065383, "kl": 0.0621337890625, "learning_rate": 7.457275390625e-07, "loss": 0.0025, "reward": 1.8227131962776184, "reward_std": 0.0507346335798502, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.822713166475296, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 378.96875, "epoch": 1.01806640625, "grad_norm": 1.0624085888750547, "kl": 0.05126953125, "learning_rate": 7.4560546875e-07, "loss": 0.0021, "reward": 1.7786809802055359, "reward_std": 0.10613211244344711, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7864934504032135, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 312.7265625, "epoch": 1.0185546875, "grad_norm": 2.255916713761177, "kl": 0.075927734375, "learning_rate": 7.454833984375e-07, "loss": 0.003, "reward": 1.7361916899681091, "reward_std": 0.1337970271706581, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7440041303634644, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 288.0859375, "epoch": 1.01904296875, "grad_norm": 3.5672731701933262, "kl": 0.0615234375, "learning_rate": 7.453613281249999e-07, "loss": 0.0025, "reward": 1.834282636642456, "reward_std": 0.04526693467050791, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8342825770378113, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 343.8515625, "epoch": 1.01953125, "grad_norm": 1.1390787112646121, "kl": 0.0599365234375, "learning_rate": 7.452392578124999e-07, "loss": 0.0024, "reward": 1.8966719508171082, "reward_std": 0.14545264467597008, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9044845402240753, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 300.3671875, "epoch": 1.02001953125, "grad_norm": 1.1424293003741923, "kl": 0.05712890625, "learning_rate": 7.451171875e-07, "loss": 0.0023, "reward": 1.776362955570221, "reward_std": 0.08313069678843021, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7841754257678986, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 285.109375, "epoch": 1.0205078125, "grad_norm": 1.2961993858286134, "kl": 0.0609130859375, "learning_rate": 7.449951171875e-07, "loss": 0.0024, "reward": 1.7457592487335205, "reward_std": 0.06624248251318932, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7457592785358429, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 302.46875, "epoch": 1.02099609375, "grad_norm": 2.9122243864865833, "kl": 0.0662841796875, "learning_rate": 7.44873046875e-07, "loss": 0.0027, "reward": 1.7659137845039368, "reward_std": 0.09149673208594322, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7815387845039368, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 347.5703125, "epoch": 1.021484375, "grad_norm": 1.7837165721571753, "kl": 0.0560302734375, "learning_rate": 7.447509765625e-07, "loss": 0.0022, "reward": 1.7793264389038086, "reward_std": 0.07757101766765118, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.779326468706131, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 271.6875, "epoch": 1.02197265625, "grad_norm": 2.6677175324191764, "kl": 0.0672607421875, "learning_rate": 7.446289062499999e-07, "loss": 0.0027, "reward": 1.7153024673461914, "reward_std": 0.06006733886897564, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7153024673461914, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 253.6484375, "epoch": 1.0224609375, "grad_norm": 8.212754923875098, "kl": 0.06787109375, "learning_rate": 7.445068359374999e-07, "loss": 0.0027, "reward": 1.731951892375946, "reward_std": 0.08415350876748562, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7319517731666565, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 327.8359375, "epoch": 1.02294921875, "grad_norm": 1.0081680404176827, "kl": 0.059326171875, "learning_rate": 7.44384765625e-07, "loss": 0.0024, "reward": 1.7743852734565735, "reward_std": 0.06954375258646905, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7743852734565735, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 343.75, "epoch": 1.0234375, "grad_norm": 2.3423641729543694, "kl": 0.07568359375, "learning_rate": 7.442626953125e-07, "loss": 0.003, "reward": 1.8153335452079773, "reward_std": 0.06889502890408039, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8153335452079773, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 348.1171875, "epoch": 1.02392578125, "grad_norm": 0.9005825292004632, "kl": 0.06982421875, "learning_rate": 7.44140625e-07, "loss": 0.0028, "reward": 1.7364252805709839, "reward_std": 0.14264655858278275, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7676753103733063, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 260.9609375, "epoch": 1.0244140625, "grad_norm": 1.2457369377691487, "kl": 0.0660400390625, "learning_rate": 7.440185546875e-07, "loss": 0.0026, "reward": 1.8361563086509705, "reward_std": 0.03824903070926666, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8361562192440033, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 309.1171875, "epoch": 1.02490234375, "grad_norm": 4.994694191481811, "kl": 0.070556640625, "learning_rate": 7.43896484375e-07, "loss": 0.0028, "reward": 1.7435556650161743, "reward_std": 0.08704771101474762, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7513681650161743, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 334.4140625, "epoch": 1.025390625, "grad_norm": 1.291822341118298, "kl": 0.069091796875, "learning_rate": 7.437744140624999e-07, "loss": 0.0028, "reward": 1.729736089706421, "reward_std": 0.05247452110052109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7297360599040985, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 302.8671875, "epoch": 1.02587890625, "grad_norm": 1.6815056470609293, "kl": 0.063720703125, "learning_rate": 7.436523437499999e-07, "loss": 0.0026, "reward": 1.813611924648285, "reward_std": 0.048070028424263, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8136118948459625, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 269.421875, "epoch": 1.0263671875, "grad_norm": 2.2241505771119563, "kl": 0.07421875, "learning_rate": 7.435302734375e-07, "loss": 0.003, "reward": 1.8664068579673767, "reward_std": 0.04198060557246208, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8664068281650543, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 337.03125, "epoch": 1.02685546875, "grad_norm": 1.3461961566886906, "kl": 0.0640869140625, "learning_rate": 7.43408203125e-07, "loss": 0.0026, "reward": 1.7394928336143494, "reward_std": 0.11257979273796082, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7473053336143494, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 335.8046875, "epoch": 1.02734375, "grad_norm": 0.7096896402429057, "kl": 0.0537109375, "learning_rate": 7.432861328125e-07, "loss": 0.0022, "reward": 1.7980987429618835, "reward_std": 0.1013258621096611, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8137237429618835, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 233.5625, "epoch": 1.02783203125, "grad_norm": 0.9569277555942687, "kl": 0.0828857421875, "learning_rate": 7.431640625e-07, "loss": 0.0033, "reward": 1.8101829886436462, "reward_std": 0.045853691175580025, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8101829886436462, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 279.3125, "epoch": 1.0283203125, "grad_norm": 4.832884831931728, "kl": 0.0743408203125, "learning_rate": 7.430419921874999e-07, "loss": 0.003, "reward": 1.7681997418403625, "reward_std": 0.12344841938465834, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7760122716426849, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 258.640625, "epoch": 1.02880859375, "grad_norm": 3.3258791760247073, "kl": 0.06689453125, "learning_rate": 7.429199218749999e-07, "loss": 0.0027, "reward": 1.7421391010284424, "reward_std": 0.05865258723497391, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7421391606330872, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 355.921875, "epoch": 1.029296875, "grad_norm": 2.6371942953766623, "kl": 0.066650390625, "learning_rate": 7.427978515625e-07, "loss": 0.0027, "reward": 1.6952205896377563, "reward_std": 0.1647278480231762, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7420955300331116, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 306.5234375, "epoch": 1.02978515625, "grad_norm": 2.0605570090114793, "kl": 0.058349609375, "learning_rate": 7.4267578125e-07, "loss": 0.0023, "reward": 1.773497223854065, "reward_std": 0.22887670993804932, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8047472238540649, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 242.0, "epoch": 1.0302734375, "grad_norm": 2.492828590978083, "kl": 0.06640625, "learning_rate": 7.425537109375e-07, "loss": 0.0027, "reward": 1.7053377032279968, "reward_std": 0.1504954844713211, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7287752032279968, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 307.8984375, "epoch": 1.03076171875, "grad_norm": 2.696773965633565, "kl": 0.07080078125, "learning_rate": 7.42431640625e-07, "loss": 0.0028, "reward": 1.8146610260009766, "reward_std": 0.12038443237543106, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8302860260009766, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 335.90625, "epoch": 1.03125, "grad_norm": 1.8989371313788723, "kl": 0.0650634765625, "learning_rate": 7.423095703125e-07, "loss": 0.0026, "reward": 1.7706368565559387, "reward_std": 0.2233428657054901, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8096993565559387, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 373.6640625, "epoch": 1.03173828125, "grad_norm": 3.6497635388956224, "kl": 0.140625, "learning_rate": 7.421874999999999e-07, "loss": 0.0056, "reward": 1.586828589439392, "reward_std": 0.30886563658714294, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6493285894393921, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 361.25, "epoch": 1.0322265625, "grad_norm": 0.9780344434658866, "kl": 0.06201171875, "learning_rate": 7.420654296874999e-07, "loss": 0.0025, "reward": 1.62257981300354, "reward_std": 0.3575499951839447, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.70851731300354, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 371.1640625, "epoch": 1.03271484375, "grad_norm": 5.949761627807109, "kl": 0.065185546875, "learning_rate": 7.41943359375e-07, "loss": 0.0026, "reward": 1.6299309730529785, "reward_std": 0.39395518600940704, "rewards/format_reward": 0.9140625, "rewards/ocr_reward": 0.7158684730529785, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 299.59375, "epoch": 1.033203125, "grad_norm": 1.059347621749564, "kl": 0.062744140625, "learning_rate": 7.418212890625e-07, "loss": 0.0025, "reward": 1.7748578786849976, "reward_std": 0.18243324011564255, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8061079382896423, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 430.53125, "epoch": 1.03369140625, "grad_norm": 0.6484997754740194, "kl": 0.0679931640625, "learning_rate": 7.4169921875e-07, "loss": 0.0027, "reward": 1.6713528037071228, "reward_std": 0.351689875125885, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.7494778335094452, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 385.7109375, "epoch": 1.0341796875, "grad_norm": 2.682455320676119, "kl": 0.0614013671875, "learning_rate": 7.415771484375e-07, "loss": 0.0025, "reward": 1.7464085221290588, "reward_std": 0.1380649134516716, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7698459923267365, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 316.53125, "epoch": 1.03466796875, "grad_norm": 0.9380873546654088, "kl": 0.0721435546875, "learning_rate": 7.414550781249999e-07, "loss": 0.0029, "reward": 1.8222784996032715, "reward_std": 0.10763486847281456, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8379034996032715, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 323.125, "epoch": 1.03515625, "grad_norm": 1.7089694241065019, "kl": 0.066650390625, "learning_rate": 7.413330078124999e-07, "loss": 0.0027, "reward": 1.680859923362732, "reward_std": 0.1279044784605503, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7121100127696991, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 368.1015625, "epoch": 1.03564453125, "grad_norm": 1.184804985444957, "kl": 0.0596923828125, "learning_rate": 7.412109375e-07, "loss": 0.0024, "reward": 1.7026260495185852, "reward_std": 0.0845637135207653, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7104385793209076, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 312.8984375, "epoch": 1.0361328125, "grad_norm": 2.4689879281350584, "kl": 0.0672607421875, "learning_rate": 7.410888671875e-07, "loss": 0.0027, "reward": 1.6870477795600891, "reward_std": 0.08825328946113586, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6948603093624115, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 255.4609375, "epoch": 1.03662109375, "grad_norm": 2.637185167724473, "kl": 0.073974609375, "learning_rate": 7.40966796875e-07, "loss": 0.003, "reward": 1.7651128768920898, "reward_std": 0.04329609777778387, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7651128768920898, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 350.7109375, "epoch": 1.037109375, "grad_norm": 15.312775728411562, "kl": 0.059814453125, "learning_rate": 7.408447265625e-07, "loss": 0.0024, "reward": 1.7873907089233398, "reward_std": 0.053318215534090996, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7873907387256622, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 275.375, "epoch": 1.03759765625, "grad_norm": 2.1162197557292513, "kl": 0.086181640625, "learning_rate": 7.4072265625e-07, "loss": 0.0034, "reward": 1.637177586555481, "reward_std": 0.0317330677062273, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6371775567531586, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 349.6171875, "epoch": 1.0380859375, "grad_norm": 5.324096223818963, "kl": 0.058837890625, "learning_rate": 7.406005859374999e-07, "loss": 0.0024, "reward": 1.7509996891021729, "reward_std": 0.06184336729347706, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7509996891021729, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 299.1875, "epoch": 1.03857421875, "grad_norm": 0.9586175770588368, "kl": 0.0633544921875, "learning_rate": 7.404785156249999e-07, "loss": 0.0025, "reward": 1.820866346359253, "reward_std": 0.11166086047887802, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8599288463592529, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 276.234375, "epoch": 1.0390625, "grad_norm": 1.9880497607902226, "kl": 0.063232421875, "learning_rate": 7.403564453125e-07, "loss": 0.0025, "reward": 1.682478904724121, "reward_std": 0.03829295188188553, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6824789345264435, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 345.234375, "epoch": 1.03955078125, "grad_norm": 0.9645795198072094, "kl": 0.0556640625, "learning_rate": 7.40234375e-07, "loss": 0.0022, "reward": 1.6462610960006714, "reward_std": 0.03332418855279684, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.646261066198349, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 362.3671875, "epoch": 1.0400390625, "grad_norm": 1.5537365872327011, "kl": 0.072509765625, "learning_rate": 7.401123046875e-07, "loss": 0.0029, "reward": 1.7381998300552368, "reward_std": 0.12407108163461089, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.792887270450592, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 288.203125, "epoch": 1.04052734375, "grad_norm": 2.7636605699535606, "kl": 0.0556640625, "learning_rate": 7.39990234375e-07, "loss": 0.0022, "reward": 1.6674214005470276, "reward_std": 0.03596335183829069, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.66742143034935, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 223.0859375, "epoch": 1.041015625, "grad_norm": 1.2451863266792458, "kl": 0.07861328125, "learning_rate": 7.398681640624999e-07, "loss": 0.0031, "reward": 1.7419597506523132, "reward_std": 0.0864316001534462, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7497721910476685, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 359.0390625, "epoch": 1.04150390625, "grad_norm": 1.5936629511193108, "kl": 0.0469970703125, "learning_rate": 7.397460937499999e-07, "loss": 0.0019, "reward": 1.6851105093955994, "reward_std": 0.0796204935759306, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.692922979593277, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 287.2421875, "epoch": 1.0419921875, "grad_norm": 3.690196248709246, "kl": 0.0635986328125, "learning_rate": 7.396240234375e-07, "loss": 0.0025, "reward": 1.7646411657333374, "reward_std": 0.08438229188323021, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7724536657333374, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 407.53125, "epoch": 1.04248046875, "grad_norm": 2.1257395476547387, "kl": 0.04931640625, "learning_rate": 7.39501953125e-07, "loss": 0.002, "reward": 1.6398783326148987, "reward_std": 0.19317952543497086, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.7180033326148987, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 281.9765625, "epoch": 1.04296875, "grad_norm": 0.8869364175450244, "kl": 0.0697021484375, "learning_rate": 7.393798828125e-07, "loss": 0.0028, "reward": 1.6968178749084473, "reward_std": 0.11358075961470604, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7046304047107697, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 232.8828125, "epoch": 1.04345703125, "grad_norm": 2.2017880020005243, "kl": 0.064453125, "learning_rate": 7.392578125e-07, "loss": 0.0026, "reward": 1.795127511024475, "reward_std": 0.044227102771401405, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7951274216175079, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 348.1328125, "epoch": 1.0439453125, "grad_norm": 2.79303089558888, "kl": 0.0516357421875, "learning_rate": 7.391357421875e-07, "loss": 0.0021, "reward": 1.7270812392234802, "reward_std": 0.11675109714269638, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7348937392234802, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 274.0546875, "epoch": 1.04443359375, "grad_norm": 1.05482411179448, "kl": 0.0499267578125, "learning_rate": 7.390136718749999e-07, "loss": 0.002, "reward": 1.8569371104240417, "reward_std": 0.027390625327825546, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8569370210170746, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 266.578125, "epoch": 1.044921875, "grad_norm": 3.373738723448789, "kl": 0.0653076171875, "learning_rate": 7.388916015624999e-07, "loss": 0.0026, "reward": 1.7933273315429688, "reward_std": 0.04443395556882024, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7933273315429688, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 340.296875, "epoch": 1.04541015625, "grad_norm": 0.9983903579493524, "kl": 0.046630859375, "learning_rate": 7.3876953125e-07, "loss": 0.0019, "reward": 1.681038737297058, "reward_std": 0.07797625940293074, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7201012670993805, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 344.15625, "epoch": 1.0458984375, "grad_norm": 1.8879030546706417, "kl": 0.0511474609375, "learning_rate": 7.386474609375e-07, "loss": 0.002, "reward": 1.6292105913162231, "reward_std": 0.21559580974280834, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6838980913162231, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 325.0078125, "epoch": 1.04638671875, "grad_norm": 1.3052945256803228, "kl": 0.0604248046875, "learning_rate": 7.38525390625e-07, "loss": 0.0024, "reward": 1.658437967300415, "reward_std": 0.11939615942537785, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6896880269050598, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 288.453125, "epoch": 1.046875, "grad_norm": 1.1710672837881737, "kl": 0.0650634765625, "learning_rate": 7.384033203125e-07, "loss": 0.0026, "reward": 1.8534250855445862, "reward_std": 0.0744034256786108, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8612376153469086, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 430.9765625, "epoch": 1.04736328125, "grad_norm": 5.639567766118087, "kl": 0.057861328125, "learning_rate": 7.382812499999999e-07, "loss": 0.0023, "reward": 1.7248152494430542, "reward_std": 0.13105768337845802, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7482527196407318, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 247.171875, "epoch": 1.0478515625, "grad_norm": 2.0263228158297935, "kl": 0.0672607421875, "learning_rate": 7.381591796874999e-07, "loss": 0.0027, "reward": 1.6987344622612, "reward_std": 0.07333962060511112, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6987345218658447, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 332.328125, "epoch": 1.04833984375, "grad_norm": 0.9474657856304045, "kl": 0.045654296875, "learning_rate": 7.38037109375e-07, "loss": 0.0018, "reward": 1.8015679717063904, "reward_std": 0.06693462654948235, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8093804717063904, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 287.84375, "epoch": 1.048828125, "grad_norm": 1.899611714790272, "kl": 0.069580078125, "learning_rate": 7.379150390625e-07, "loss": 0.0028, "reward": 1.776853621006012, "reward_std": 0.07761351764202118, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7846660614013672, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 273.53125, "epoch": 1.04931640625, "grad_norm": 1.2684760918178732, "kl": 0.072265625, "learning_rate": 7.3779296875e-07, "loss": 0.0029, "reward": 1.7402900457382202, "reward_std": 0.050347575917840004, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.740289956331253, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 270.546875, "epoch": 1.0498046875, "grad_norm": 0.8851627060755088, "kl": 0.068603515625, "learning_rate": 7.376708984375e-07, "loss": 0.0028, "reward": 1.66942298412323, "reward_std": 0.06483565643429756, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6694230437278748, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 290.9765625, "epoch": 1.05029296875, "grad_norm": 1.0989385292696576, "kl": 0.0732421875, "learning_rate": 7.37548828125e-07, "loss": 0.0029, "reward": 1.734316349029541, "reward_std": 0.10244572162628174, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7421287596225739, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 314.375, "epoch": 1.05078125, "grad_norm": 1.3053195164204594, "kl": 0.0589599609375, "learning_rate": 7.374267578124999e-07, "loss": 0.0024, "reward": 1.7859277725219727, "reward_std": 0.09848207421600819, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8015527129173279, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 249.9609375, "epoch": 1.05126953125, "grad_norm": 4.695553247711008, "kl": 0.0660400390625, "learning_rate": 7.373046874999999e-07, "loss": 0.0026, "reward": 1.8197780847549438, "reward_std": 0.04821081645786762, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8197780549526215, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 338.5703125, "epoch": 1.0517578125, "grad_norm": 1.4462058737833505, "kl": 0.0604248046875, "learning_rate": 7.371826171875e-07, "loss": 0.0024, "reward": 1.705611228942871, "reward_std": 0.04024476930499077, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7056111991405487, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 289.7109375, "epoch": 1.05224609375, "grad_norm": 3.3498088491590505, "kl": 0.05126953125, "learning_rate": 7.37060546875e-07, "loss": 0.002, "reward": 1.7300852537155151, "reward_std": 0.08649061527103186, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7378977537155151, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 282.6328125, "epoch": 1.052734375, "grad_norm": 1.7860979292641492, "kl": 0.066650390625, "learning_rate": 7.369384765625e-07, "loss": 0.0027, "reward": 1.749430775642395, "reward_std": 0.05913347005844116, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.749430775642395, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 250.8828125, "epoch": 1.05322265625, "grad_norm": 14.752756244291026, "kl": 0.0562744140625, "learning_rate": 7.3681640625e-07, "loss": 0.0023, "reward": 1.8864418268203735, "reward_std": 0.051150595769286156, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8942543268203735, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 286.8359375, "epoch": 1.0537109375, "grad_norm": 1.4880684525219599, "kl": 0.065673828125, "learning_rate": 7.366943359374999e-07, "loss": 0.0026, "reward": 1.7557436227798462, "reward_std": 0.07325353659689426, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.763556182384491, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 309.8203125, "epoch": 1.05419921875, "grad_norm": 1.227305473803353, "kl": 0.05078125, "learning_rate": 7.365722656249999e-07, "loss": 0.002, "reward": 1.6383469700813293, "reward_std": 0.07099130935966969, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6461593806743622, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 334.7109375, "epoch": 1.0546875, "grad_norm": 0.7035510885180098, "kl": 0.0516357421875, "learning_rate": 7.364501953124999e-07, "loss": 0.0021, "reward": 1.8514134287834167, "reward_std": 0.047458380460739136, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8514134883880615, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 355.8984375, "epoch": 1.05517578125, "grad_norm": 1.1121154130874542, "kl": 0.0587158203125, "learning_rate": 7.36328125e-07, "loss": 0.0023, "reward": 1.8369617462158203, "reward_std": 0.05468747764825821, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8369618058204651, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 296.53125, "epoch": 1.0556640625, "grad_norm": 3.7658263078887484, "kl": 0.056640625, "learning_rate": 7.362060546875e-07, "loss": 0.0023, "reward": 1.7871454954147339, "reward_std": 0.10257207229733467, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7949579656124115, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 320.6171875, "epoch": 1.05615234375, "grad_norm": 1.1368885648860967, "kl": 0.065673828125, "learning_rate": 7.36083984375e-07, "loss": 0.0026, "reward": 1.685727596282959, "reward_std": 0.13240405172109604, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.693540096282959, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 311.2421875, "epoch": 1.056640625, "grad_norm": 1.4737797498246885, "kl": 0.070068359375, "learning_rate": 7.359619140625e-07, "loss": 0.0028, "reward": 1.6595805883407593, "reward_std": 0.034869059920310974, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6595805883407593, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 350.234375, "epoch": 1.05712890625, "grad_norm": 4.315861274519875, "kl": 0.0625, "learning_rate": 7.358398437499999e-07, "loss": 0.0025, "reward": 1.779079794883728, "reward_std": 0.05369388684630394, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7790797650814056, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 320.6875, "epoch": 1.0576171875, "grad_norm": 2.32247382157244, "kl": 0.0498046875, "learning_rate": 7.357177734374999e-07, "loss": 0.002, "reward": 1.7220231294631958, "reward_std": 0.049990251660346985, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7220230996608734, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 334.7109375, "epoch": 1.05810546875, "grad_norm": 1.3332007134810149, "kl": 0.0616455078125, "learning_rate": 7.35595703125e-07, "loss": 0.0025, "reward": 1.776341736316681, "reward_std": 0.09242127742618322, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8075916767120361, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 257.34375, "epoch": 1.05859375, "grad_norm": 2.707627459785679, "kl": 0.08056640625, "learning_rate": 7.354736328125e-07, "loss": 0.0032, "reward": 1.7095162868499756, "reward_std": 0.0930749960243702, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7173287868499756, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 341.8671875, "epoch": 1.05908203125, "grad_norm": 1.5790115563997662, "kl": 0.0679931640625, "learning_rate": 7.353515625e-07, "loss": 0.0027, "reward": 1.833968698978424, "reward_std": 0.1615981161594391, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8574061989784241, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 265.4609375, "epoch": 1.0595703125, "grad_norm": 1.216637779595074, "kl": 0.067138671875, "learning_rate": 7.352294921875e-07, "loss": 0.0027, "reward": 1.8065576553344727, "reward_std": 0.048122160136699677, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8065576553344727, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 303.9375, "epoch": 1.06005859375, "grad_norm": 1.7503160947547827, "kl": 0.0791015625, "learning_rate": 7.35107421875e-07, "loss": 0.0032, "reward": 1.676461935043335, "reward_std": 0.07576654106378555, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.684274435043335, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 345.359375, "epoch": 1.060546875, "grad_norm": 2.082251913418121, "kl": 0.07080078125, "learning_rate": 7.349853515624999e-07, "loss": 0.0028, "reward": 1.5780660510063171, "reward_std": 0.08183467015624046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5780660212039948, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 384.2265625, "epoch": 1.06103515625, "grad_norm": 1.3618484884392383, "kl": 0.066162109375, "learning_rate": 7.348632812499999e-07, "loss": 0.0026, "reward": 1.6594600677490234, "reward_std": 0.1468387171626091, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6985225975513458, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 225.78125, "epoch": 1.0615234375, "grad_norm": 1.1921095797514936, "kl": 0.091552734375, "learning_rate": 7.347412109375e-07, "loss": 0.0037, "reward": 1.7010453343391418, "reward_std": 0.07318814843893051, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7088578939437866, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 416.5546875, "epoch": 1.06201171875, "grad_norm": 0.8033656102067087, "kl": 0.0726318359375, "learning_rate": 7.34619140625e-07, "loss": 0.0029, "reward": 1.5880872011184692, "reward_std": 0.21278053149580956, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6271496415138245, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 294.2109375, "epoch": 1.0625, "grad_norm": 1.5869611446935907, "kl": 0.07666015625, "learning_rate": 7.344970703125e-07, "loss": 0.0031, "reward": 1.7660531997680664, "reward_std": 0.09016413614153862, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7738656997680664, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 243.4921875, "epoch": 1.06298828125, "grad_norm": 1.677072145006438, "kl": 0.072021484375, "learning_rate": 7.34375e-07, "loss": 0.0029, "reward": 1.797443151473999, "reward_std": 0.06796230189502239, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7974432110786438, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 275.8515625, "epoch": 1.0634765625, "grad_norm": 1.3737869324375402, "kl": 0.0567626953125, "learning_rate": 7.342529296874999e-07, "loss": 0.0023, "reward": 1.7921919226646423, "reward_std": 0.03785792738199234, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7921919226646423, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 383.09375, "epoch": 1.06396484375, "grad_norm": 0.8350475539832674, "kl": 0.052734375, "learning_rate": 7.341308593749999e-07, "loss": 0.0021, "reward": 1.8188701272010803, "reward_std": 0.13417918607592583, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8423075675964355, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 390.6015625, "epoch": 1.064453125, "grad_norm": 2.0295192660753987, "kl": 0.076171875, "learning_rate": 7.340087890625e-07, "loss": 0.003, "reward": 1.6831302642822266, "reward_std": 0.11725856736302376, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6987552344799042, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 395.7734375, "epoch": 1.06494140625, "grad_norm": 1.823650973787344, "kl": 0.083251953125, "learning_rate": 7.3388671875e-07, "loss": 0.0033, "reward": 1.677744746208191, "reward_std": 0.09554462134838104, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6855571866035461, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 336.46875, "epoch": 1.0654296875, "grad_norm": 0.5930383971596412, "kl": 0.059814453125, "learning_rate": 7.337646484375e-07, "loss": 0.0024, "reward": 1.9693381786346436, "reward_std": 0.1475313939154148, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 1.000588208436966, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 311.2890625, "epoch": 1.06591796875, "grad_norm": 0.8776224591700195, "kl": 0.0567626953125, "learning_rate": 7.33642578125e-07, "loss": 0.0023, "reward": 1.7541506886482239, "reward_std": 0.04516553692519665, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7541506886482239, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 350.109375, "epoch": 1.06640625, "grad_norm": 1.6024822866707846, "kl": 0.0609130859375, "learning_rate": 7.335205078125e-07, "loss": 0.0024, "reward": 1.7111125588417053, "reward_std": 0.055892692878842354, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7111125588417053, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 436.7421875, "epoch": 1.06689453125, "grad_norm": 1.2580164449344338, "kl": 0.070068359375, "learning_rate": 7.333984374999999e-07, "loss": 0.0028, "reward": 1.823473334312439, "reward_std": 0.0914062550291419, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8312858641147614, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 320.5625, "epoch": 1.0673828125, "grad_norm": 2.0897595557191124, "kl": 0.0743408203125, "learning_rate": 7.332763671874999e-07, "loss": 0.003, "reward": 1.7943394780158997, "reward_std": 0.071324672549963, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7943394482135773, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 370.859375, "epoch": 1.06787109375, "grad_norm": 1.711752883290858, "kl": 0.072509765625, "learning_rate": 7.33154296875e-07, "loss": 0.0029, "reward": 1.6719991564750671, "reward_std": 0.05058279260993004, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6719991564750671, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 349.8359375, "epoch": 1.068359375, "grad_norm": 1.7191611604780395, "kl": 0.064453125, "learning_rate": 7.330322265625e-07, "loss": 0.0026, "reward": 1.7175182700157166, "reward_std": 0.08080036751925945, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7253307402133942, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 221.0859375, "epoch": 1.06884765625, "grad_norm": 1.5113242303718457, "kl": 0.0703125, "learning_rate": 7.3291015625e-07, "loss": 0.0028, "reward": 1.8223052620887756, "reward_std": 0.03729821881279349, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8223052620887756, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 319.984375, "epoch": 1.0693359375, "grad_norm": 1.8504864852407057, "kl": 0.072998046875, "learning_rate": 7.327880859375e-07, "loss": 0.0029, "reward": 1.8702041506767273, "reward_std": 0.08449077978730202, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8702041506767273, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 318.265625, "epoch": 1.06982421875, "grad_norm": 1.8618054773039303, "kl": 0.08642578125, "learning_rate": 7.326660156249999e-07, "loss": 0.0035, "reward": 1.7093619108200073, "reward_std": 0.04153325408697128, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7093620002269745, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 380.109375, "epoch": 1.0703125, "grad_norm": 2.721104333800809, "kl": 0.078125, "learning_rate": 7.325439453124999e-07, "loss": 0.0031, "reward": 1.7475308179855347, "reward_std": 0.06819172203540802, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7475307881832123, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 300.4609375, "epoch": 1.07080078125, "grad_norm": 1.1564680853165057, "kl": 0.0570068359375, "learning_rate": 7.32421875e-07, "loss": 0.0023, "reward": 1.8769598603248596, "reward_std": 0.027791874017566442, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8769599199295044, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 417.421875, "epoch": 1.0712890625, "grad_norm": 7.862477096986793, "kl": 0.0655517578125, "learning_rate": 7.322998046875e-07, "loss": 0.0026, "reward": 1.7873188257217407, "reward_std": 0.1392914056777954, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8185688853263855, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 268.3359375, "epoch": 1.07177734375, "grad_norm": 2.5077562517083343, "kl": 0.071044921875, "learning_rate": 7.32177734375e-07, "loss": 0.0028, "reward": 1.8010922074317932, "reward_std": 0.0915432795882225, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8010921478271484, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 290.109375, "epoch": 1.072265625, "grad_norm": 1.8689579905691132, "kl": 0.075927734375, "learning_rate": 7.320556640625e-07, "loss": 0.003, "reward": 1.7513406872749329, "reward_std": 0.09631795436143875, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7513406872749329, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 416.1640625, "epoch": 1.07275390625, "grad_norm": 1.7191972586224031, "kl": 0.0577392578125, "learning_rate": 7.3193359375e-07, "loss": 0.0023, "reward": 1.7447272539138794, "reward_std": 0.16263741254806519, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7759771645069122, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 321.796875, "epoch": 1.0732421875, "grad_norm": 8.308076466127147, "kl": 0.071533203125, "learning_rate": 7.318115234374999e-07, "loss": 0.0029, "reward": 1.70778489112854, "reward_std": 0.11081130802631378, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.71559739112854, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 345.90625, "epoch": 1.07373046875, "grad_norm": 5.039129274024845, "kl": 0.075439453125, "learning_rate": 7.316894531249999e-07, "loss": 0.003, "reward": 1.765058159828186, "reward_std": 0.10599537566304207, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7728706300258636, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 333.2109375, "epoch": 1.07421875, "grad_norm": 2.6078909888453574, "kl": 0.071533203125, "learning_rate": 7.315673828125e-07, "loss": 0.0029, "reward": 1.7972966432571411, "reward_std": 0.05169523321092129, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7972966134548187, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 296.546875, "epoch": 1.07470703125, "grad_norm": 2.26953245897716, "kl": 0.069580078125, "learning_rate": 7.314453125e-07, "loss": 0.0028, "reward": 1.7383949160575867, "reward_std": 0.08352330699563026, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7540199458599091, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 356.9375, "epoch": 1.0751953125, "grad_norm": 1.6816262192340925, "kl": 0.0643310546875, "learning_rate": 7.313232421875e-07, "loss": 0.0026, "reward": 1.6924183368682861, "reward_std": 0.06870663538575172, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6924182772636414, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 377.75, "epoch": 1.07568359375, "grad_norm": 4.603195904454922, "kl": 0.066162109375, "learning_rate": 7.31201171875e-07, "loss": 0.0026, "reward": 1.8771857619285583, "reward_std": 0.15731638204306364, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.900623232126236, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 374.4921875, "epoch": 1.076171875, "grad_norm": 9.605729154350279, "kl": 0.0625, "learning_rate": 7.310791015624999e-07, "loss": 0.0025, "reward": 1.7749759554862976, "reward_std": 0.17696334049105644, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.80622598528862, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 285.359375, "epoch": 1.07666015625, "grad_norm": 2.172820429250745, "kl": 0.0635986328125, "learning_rate": 7.309570312499999e-07, "loss": 0.0025, "reward": 1.7500771880149841, "reward_std": 0.04752637818455696, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7500771284103394, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 298.8984375, "epoch": 1.0771484375, "grad_norm": 1.4018219648742998, "kl": 0.07421875, "learning_rate": 7.308349609375e-07, "loss": 0.003, "reward": 1.7220321893692017, "reward_std": 0.11039461940526962, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7298446595668793, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 282.5546875, "epoch": 1.07763671875, "grad_norm": 0.8089076092208608, "kl": 0.064208984375, "learning_rate": 7.30712890625e-07, "loss": 0.0026, "reward": 1.7933659553527832, "reward_std": 0.04852168867364526, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.793365865945816, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 387.34375, "epoch": 1.078125, "grad_norm": 1.005591084680332, "kl": 0.0633544921875, "learning_rate": 7.305908203125e-07, "loss": 0.0025, "reward": 1.637376844882965, "reward_std": 0.13998160883784294, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6686268448829651, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 269.0546875, "epoch": 1.07861328125, "grad_norm": 0.8635824378840103, "kl": 0.06103515625, "learning_rate": 7.3046875e-07, "loss": 0.0024, "reward": 1.8138604164123535, "reward_std": 0.016972179524600506, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8138603866100311, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 283.828125, "epoch": 1.0791015625, "grad_norm": 4.198636543673609, "kl": 0.0733642578125, "learning_rate": 7.303466796875e-07, "loss": 0.0029, "reward": 1.786705732345581, "reward_std": 0.08687588106840849, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.794518232345581, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 296.015625, "epoch": 1.07958984375, "grad_norm": 3.47696025291616, "kl": 0.0675048828125, "learning_rate": 7.302246093749999e-07, "loss": 0.0027, "reward": 1.8078295588493347, "reward_std": 0.08413361757993698, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8156421184539795, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 238.359375, "epoch": 1.080078125, "grad_norm": 3.8183450251736746, "kl": 0.083740234375, "learning_rate": 7.301025390624999e-07, "loss": 0.0034, "reward": 1.746371567249298, "reward_std": 0.06931864097714424, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7463716566562653, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 358.0625, "epoch": 1.08056640625, "grad_norm": 4.84020947064792, "kl": 0.06640625, "learning_rate": 7.2998046875e-07, "loss": 0.0027, "reward": 1.7851145267486572, "reward_std": 0.10719123855233192, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.816364586353302, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 382.9765625, "epoch": 1.0810546875, "grad_norm": 1.9009974118221236, "kl": 0.0609130859375, "learning_rate": 7.298583984375e-07, "loss": 0.0024, "reward": 1.6994649171829224, "reward_std": 0.0930807814002037, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7072774171829224, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 344.7890625, "epoch": 1.08154296875, "grad_norm": 8.11275335382126, "kl": 0.068115234375, "learning_rate": 7.29736328125e-07, "loss": 0.0027, "reward": 1.7040226459503174, "reward_std": 0.13488183170557022, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7274601459503174, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 259.3984375, "epoch": 1.08203125, "grad_norm": 2.137119825015467, "kl": 0.07373046875, "learning_rate": 7.296142578125e-07, "loss": 0.0029, "reward": 1.944073498249054, "reward_std": 0.17653799057006836, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9518861174583435, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 277.9375, "epoch": 1.08251953125, "grad_norm": 1.1241963585520527, "kl": 0.072021484375, "learning_rate": 7.294921874999999e-07, "loss": 0.0029, "reward": 1.6532188653945923, "reward_std": 0.06231350637972355, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6610313355922699, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 239.546875, "epoch": 1.0830078125, "grad_norm": 2.7140903444876523, "kl": 0.0634765625, "learning_rate": 7.293701171874999e-07, "loss": 0.0025, "reward": 1.8621540069580078, "reward_std": 0.038842491805553436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8621540367603302, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 217.359375, "epoch": 1.08349609375, "grad_norm": 2.191163607204704, "kl": 0.063720703125, "learning_rate": 7.29248046875e-07, "loss": 0.0026, "reward": 1.8771589994430542, "reward_std": 0.025433420203626156, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8771590292453766, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 342.3125, "epoch": 1.083984375, "grad_norm": 3.2019990973459733, "kl": 0.064453125, "learning_rate": 7.291259765625e-07, "loss": 0.0026, "reward": 1.6369213461875916, "reward_std": 0.11391383782029152, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6447338461875916, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 292.75, "epoch": 1.08447265625, "grad_norm": 1.2457890025032394, "kl": 0.07177734375, "learning_rate": 7.2900390625e-07, "loss": 0.0029, "reward": 1.7395640015602112, "reward_std": 0.06605142541229725, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7395639717578888, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 337.578125, "epoch": 1.0849609375, "grad_norm": 2.786622632317203, "kl": 0.073974609375, "learning_rate": 7.288818359375e-07, "loss": 0.003, "reward": 1.773886501789093, "reward_std": 0.06187342666089535, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7738864719867706, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 330.6015625, "epoch": 1.08544921875, "grad_norm": 1.4337096043043174, "kl": 0.067626953125, "learning_rate": 7.28759765625e-07, "loss": 0.0027, "reward": 1.717581033706665, "reward_std": 0.03319558780640364, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7175810039043427, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 276.96875, "epoch": 1.0859375, "grad_norm": 1.5681450356473852, "kl": 0.056640625, "learning_rate": 7.286376953124999e-07, "loss": 0.0023, "reward": 1.812729299068451, "reward_std": 0.07597517222166061, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8127292990684509, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 237.9921875, "epoch": 1.08642578125, "grad_norm": 1.4098337592431196, "kl": 0.07763671875, "learning_rate": 7.285156249999999e-07, "loss": 0.0031, "reward": 1.8502396941184998, "reward_std": 0.03288627602159977, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8502396941184998, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 250.7421875, "epoch": 1.0869140625, "grad_norm": 1.078144462327901, "kl": 0.071044921875, "learning_rate": 7.283935546875e-07, "loss": 0.0028, "reward": 1.8840885162353516, "reward_std": 0.07063583564013243, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8919010162353516, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 360.9140625, "epoch": 1.08740234375, "grad_norm": 3.635268117383236, "kl": 0.0703125, "learning_rate": 7.28271484375e-07, "loss": 0.0028, "reward": 1.791632056236267, "reward_std": 0.11924531310796738, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8150696158409119, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 283.9453125, "epoch": 1.087890625, "grad_norm": 2.4520320812851817, "kl": 0.070068359375, "learning_rate": 7.281494140625e-07, "loss": 0.0028, "reward": 1.7973357439041138, "reward_std": 0.0609661303460598, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7973355948925018, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 308.953125, "epoch": 1.08837890625, "grad_norm": 3.1691940235472074, "kl": 0.08837890625, "learning_rate": 7.2802734375e-07, "loss": 0.0035, "reward": 1.6005699038505554, "reward_std": 0.058797843754291534, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6083824634552002, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 287.8359375, "epoch": 1.0888671875, "grad_norm": 1.4854856618683832, "kl": 0.069091796875, "learning_rate": 7.279052734374999e-07, "loss": 0.0028, "reward": 1.8553495407104492, "reward_std": 0.07698746025562286, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8553495407104492, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 343.7734375, "epoch": 1.08935546875, "grad_norm": 1.6672821205576769, "kl": 0.079345703125, "learning_rate": 7.277832031249999e-07, "loss": 0.0032, "reward": 1.7386040091514587, "reward_std": 0.09536767937242985, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7542290091514587, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 345.9296875, "epoch": 1.08984375, "grad_norm": 3.060835361852339, "kl": 0.0650634765625, "learning_rate": 7.276611328125e-07, "loss": 0.0026, "reward": 1.7035446763038635, "reward_std": 0.10484276339411736, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7426071465015411, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 248.453125, "epoch": 1.09033203125, "grad_norm": 2.383603260793994, "kl": 0.0888671875, "learning_rate": 7.275390625e-07, "loss": 0.0036, "reward": 1.7988132238388062, "reward_std": 0.07059590518474579, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7988132238388062, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 266.7421875, "epoch": 1.0908203125, "grad_norm": 1.4816291696715755, "kl": 0.0946044921875, "learning_rate": 7.274169921875e-07, "loss": 0.0038, "reward": 1.8334471583366394, "reward_std": 0.08519222773611546, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.856884628534317, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 242.78125, "epoch": 1.09130859375, "grad_norm": 1.8894162923830333, "kl": 0.07861328125, "learning_rate": 7.27294921875e-07, "loss": 0.0031, "reward": 1.6886449456214905, "reward_std": 0.09802973223850131, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7042699754238129, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 241.0, "epoch": 1.091796875, "grad_norm": 1.062327646687958, "kl": 0.0654296875, "learning_rate": 7.271728515625e-07, "loss": 0.0026, "reward": 1.7099770307540894, "reward_std": 0.0585494851693511, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7099769711494446, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 356.640625, "epoch": 1.09228515625, "grad_norm": 2.6614706971362003, "kl": 0.0672607421875, "learning_rate": 7.270507812499999e-07, "loss": 0.0027, "reward": 1.5697939991950989, "reward_std": 0.12034578062593937, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5854189693927765, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 282.5390625, "epoch": 1.0927734375, "grad_norm": 3.4880008369377125, "kl": 0.084228515625, "learning_rate": 7.269287109374999e-07, "loss": 0.0034, "reward": 1.8251853585243225, "reward_std": 0.03789713280275464, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8251853585243225, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 273.890625, "epoch": 1.09326171875, "grad_norm": 1.8948511963642394, "kl": 0.091064453125, "learning_rate": 7.26806640625e-07, "loss": 0.0036, "reward": 1.7872638702392578, "reward_std": 0.08164054993540049, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8028888702392578, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 300.71875, "epoch": 1.09375, "grad_norm": 1.391838149766285, "kl": 0.091796875, "learning_rate": 7.266845703125e-07, "loss": 0.0037, "reward": 1.7387813925743103, "reward_std": 0.18217945843935013, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7622189223766327, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 294.0546875, "epoch": 1.09423828125, "grad_norm": 0.7556167192817572, "kl": 0.083740234375, "learning_rate": 7.265625e-07, "loss": 0.0033, "reward": 1.7732893228530884, "reward_std": 0.03759356215596199, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7732893526554108, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 324.28125, "epoch": 1.0947265625, "grad_norm": 2.16796468008093, "kl": 0.079833984375, "learning_rate": 7.264404296875e-07, "loss": 0.0032, "reward": 1.8013297319412231, "reward_std": 0.15706830099225044, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8247672319412231, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 403.2578125, "epoch": 1.09521484375, "grad_norm": 0.929112376338198, "kl": 0.065185546875, "learning_rate": 7.263183593749999e-07, "loss": 0.0026, "reward": 1.570958137512207, "reward_std": 0.17315081879496574, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.602208137512207, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 310.65625, "epoch": 1.095703125, "grad_norm": 1.2746975120920747, "kl": 0.0648193359375, "learning_rate": 7.261962890624999e-07, "loss": 0.0026, "reward": 1.7777118682861328, "reward_std": 0.06291940249502659, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7777118384838104, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 315.734375, "epoch": 1.09619140625, "grad_norm": 1.7464595790772883, "kl": 0.070556640625, "learning_rate": 7.2607421875e-07, "loss": 0.0028, "reward": 1.8250086903572083, "reward_std": 0.1407541036605835, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8406336605548859, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 329.875, "epoch": 1.0966796875, "grad_norm": 1.8014959178143817, "kl": 0.0888671875, "learning_rate": 7.259521484375e-07, "loss": 0.0035, "reward": 1.7720280885696411, "reward_std": 0.1236952543258667, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7798406481742859, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 291.4765625, "epoch": 1.09716796875, "grad_norm": 1.692290936562916, "kl": 0.07861328125, "learning_rate": 7.25830078125e-07, "loss": 0.0032, "reward": 1.8413395881652832, "reward_std": 0.05821367911994457, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8413394689559937, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 369.640625, "epoch": 1.09765625, "grad_norm": 3.439561683238805, "kl": 0.0521240234375, "learning_rate": 7.257080078125e-07, "loss": 0.0021, "reward": 1.773597240447998, "reward_std": 0.060536185279488564, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7735972106456757, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 369.5703125, "epoch": 1.09814453125, "grad_norm": 3.0080579318285294, "kl": 0.072021484375, "learning_rate": 7.255859375e-07, "loss": 0.0029, "reward": 1.6555711030960083, "reward_std": 0.08909568935632706, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6555710732936859, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 338.2578125, "epoch": 1.0986328125, "grad_norm": 4.282105322707064, "kl": 0.0579833984375, "learning_rate": 7.254638671874999e-07, "loss": 0.0023, "reward": 1.7793956995010376, "reward_std": 0.0656171664595604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7793957591056824, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 319.7578125, "epoch": 1.09912109375, "grad_norm": 1.1360255555472931, "kl": 0.079345703125, "learning_rate": 7.253417968749999e-07, "loss": 0.0032, "reward": 1.7589207887649536, "reward_std": 0.16139283776283264, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7823582887649536, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 376.578125, "epoch": 1.099609375, "grad_norm": 1.5781494589643414, "kl": 0.06640625, "learning_rate": 7.252197265625e-07, "loss": 0.0027, "reward": 1.781304121017456, "reward_std": 0.15949422121047974, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8047415614128113, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 354.1484375, "epoch": 1.10009765625, "grad_norm": 2.2259355754649603, "kl": 0.0631103515625, "learning_rate": 7.2509765625e-07, "loss": 0.0025, "reward": 1.8216727375984192, "reward_std": 0.07048023492097855, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8216726779937744, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 278.328125, "epoch": 1.1005859375, "grad_norm": 2.982676387228891, "kl": 0.071533203125, "learning_rate": 7.249755859375e-07, "loss": 0.0029, "reward": 1.5965590476989746, "reward_std": 0.10238468833267689, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6043716222047806, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 372.8671875, "epoch": 1.10107421875, "grad_norm": 1.865700837610057, "kl": 0.0584716796875, "learning_rate": 7.24853515625e-07, "loss": 0.0023, "reward": 1.6730265617370605, "reward_std": 0.1227874830365181, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6964640319347382, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 337.9453125, "epoch": 1.1015625, "grad_norm": 3.122567250012638, "kl": 0.07275390625, "learning_rate": 7.247314453125e-07, "loss": 0.0029, "reward": 1.7611711025238037, "reward_std": 0.10396287217736244, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7767961025238037, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 285.015625, "epoch": 1.10205078125, "grad_norm": 0.9976986455436946, "kl": 0.06396484375, "learning_rate": 7.246093749999999e-07, "loss": 0.0026, "reward": 1.740997850894928, "reward_std": 0.07786927185952663, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7409978210926056, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 316.4453125, "epoch": 1.1025390625, "grad_norm": 1.1672582198818704, "kl": 0.065185546875, "learning_rate": 7.244873046874999e-07, "loss": 0.0026, "reward": 1.817870855331421, "reward_std": 0.0449886042624712, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8178708851337433, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 290.8203125, "epoch": 1.10302734375, "grad_norm": 3.1200822986848706, "kl": 0.0601806640625, "learning_rate": 7.24365234375e-07, "loss": 0.0024, "reward": 1.7606118321418762, "reward_std": 0.05959334224462509, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7684243321418762, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 298.4921875, "epoch": 1.103515625, "grad_norm": 3.7522311592920388, "kl": 0.0587158203125, "learning_rate": 7.242431640625e-07, "loss": 0.0024, "reward": 1.800333023071289, "reward_std": 0.08324110694229603, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8081456422805786, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 288.1484375, "epoch": 1.10400390625, "grad_norm": 2.944245123139588, "kl": 0.05859375, "learning_rate": 7.2412109375e-07, "loss": 0.0023, "reward": 1.8839264512062073, "reward_std": 0.0873430147767067, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8839264810085297, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 350.4765625, "epoch": 1.1044921875, "grad_norm": 1.7540471861864702, "kl": 0.0521240234375, "learning_rate": 7.239990234375e-07, "loss": 0.0021, "reward": 1.7210676670074463, "reward_std": 0.14550930261611938, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7445051968097687, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 296.7265625, "epoch": 1.10498046875, "grad_norm": 3.2632375700049128, "kl": 0.0548095703125, "learning_rate": 7.238769531249999e-07, "loss": 0.0022, "reward": 1.754812240600586, "reward_std": 0.04835915379226208, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7548122107982635, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 273.734375, "epoch": 1.10546875, "grad_norm": 2.9460406884659176, "kl": 0.0577392578125, "learning_rate": 7.237548828124999e-07, "loss": 0.0023, "reward": 1.8352848291397095, "reward_std": 0.16885582357645035, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8665347993373871, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 384.5, "epoch": 1.10595703125, "grad_norm": 1.0028134561756605, "kl": 0.052001953125, "learning_rate": 7.236328125e-07, "loss": 0.0021, "reward": 1.858300268650055, "reward_std": 0.04617350362241268, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8583002388477325, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 310.625, "epoch": 1.1064453125, "grad_norm": 4.3938839661401135, "kl": 0.0679931640625, "learning_rate": 7.235107421875e-07, "loss": 0.0027, "reward": 1.7961427569389343, "reward_std": 0.04284539166837931, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7961426973342896, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 244.609375, "epoch": 1.10693359375, "grad_norm": 1.1672021555808423, "kl": 0.068115234375, "learning_rate": 7.23388671875e-07, "loss": 0.0027, "reward": 1.7131685614585876, "reward_std": 0.05846385471522808, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7131686210632324, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 372.4453125, "epoch": 1.107421875, "grad_norm": 0.856824937232512, "kl": 0.06591796875, "learning_rate": 7.232666015625e-07, "loss": 0.0026, "reward": 1.7381606698036194, "reward_std": 0.17856748402118683, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7537856698036194, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 231.5703125, "epoch": 1.10791015625, "grad_norm": 3.9627794439298425, "kl": 0.076416015625, "learning_rate": 7.2314453125e-07, "loss": 0.0031, "reward": 1.6982629299163818, "reward_std": 0.07101480662822723, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6982628703117371, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 311.5625, "epoch": 1.1083984375, "grad_norm": 1.914687836920094, "kl": 0.072509765625, "learning_rate": 7.230224609374999e-07, "loss": 0.0029, "reward": 1.6740421056747437, "reward_std": 0.10210954397916794, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6740420460700989, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 322.359375, "epoch": 1.10888671875, "grad_norm": 1.1215367514845478, "kl": 0.073486328125, "learning_rate": 7.229003906249999e-07, "loss": 0.0029, "reward": 1.6971306204795837, "reward_std": 0.11026806011795998, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7049430906772614, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 225.265625, "epoch": 1.109375, "grad_norm": 1.1173595552235849, "kl": 0.0670166015625, "learning_rate": 7.227783203125e-07, "loss": 0.0027, "reward": 1.8012632131576538, "reward_std": 0.062459973618388176, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8090757131576538, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 304.8984375, "epoch": 1.10986328125, "grad_norm": 0.7894205502775127, "kl": 0.0615234375, "learning_rate": 7.2265625e-07, "loss": 0.0025, "reward": 1.776045799255371, "reward_std": 0.04911462590098381, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7760457992553711, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 338.125, "epoch": 1.1103515625, "grad_norm": 1.0027845954069041, "kl": 0.078857421875, "learning_rate": 7.225341796875e-07, "loss": 0.0032, "reward": 1.722011387348175, "reward_std": 0.07575457729399204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7220114171504974, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 313.453125, "epoch": 1.11083984375, "grad_norm": 10.409760523516997, "kl": 0.05712890625, "learning_rate": 7.22412109375e-07, "loss": 0.0023, "reward": 1.7924315929412842, "reward_std": 0.08167605847120285, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7924315631389618, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 296.5625, "epoch": 1.111328125, "grad_norm": 2.834746616866118, "kl": 0.0628662109375, "learning_rate": 7.222900390624999e-07, "loss": 0.0025, "reward": 1.8050841689109802, "reward_std": 0.07527113519608974, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8128966391086578, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 270.0625, "epoch": 1.11181640625, "grad_norm": 2.7173474996855993, "kl": 0.0703125, "learning_rate": 7.221679687499999e-07, "loss": 0.0028, "reward": 1.7315622568130493, "reward_std": 0.06561515107750893, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7393746674060822, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 268.0, "epoch": 1.1123046875, "grad_norm": 1.4927968621455245, "kl": 0.068603515625, "learning_rate": 7.220458984375e-07, "loss": 0.0027, "reward": 1.8361743092536926, "reward_std": 0.03312414512038231, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8361742496490479, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 254.2734375, "epoch": 1.11279296875, "grad_norm": 3.339265407657078, "kl": 0.080078125, "learning_rate": 7.21923828125e-07, "loss": 0.0032, "reward": 1.728402554988861, "reward_std": 0.13297371938824654, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7284024953842163, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 301.0625, "epoch": 1.11328125, "grad_norm": 6.372972770256252, "kl": 0.083984375, "learning_rate": 7.218017578125e-07, "loss": 0.0034, "reward": 1.7280917167663574, "reward_std": 0.052303411066532135, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.728091686964035, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 258.84375, "epoch": 1.11376953125, "grad_norm": 4.351237154663654, "kl": 0.088623046875, "learning_rate": 7.216796875e-07, "loss": 0.0035, "reward": 1.7941367626190186, "reward_std": 0.08051660470664501, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7941367924213409, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 313.8984375, "epoch": 1.1142578125, "grad_norm": 1.0055680467034633, "kl": 0.0589599609375, "learning_rate": 7.215576171875e-07, "loss": 0.0024, "reward": 1.7715181112289429, "reward_std": 0.07638098672032356, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7715181410312653, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 315.2421875, "epoch": 1.11474609375, "grad_norm": 2.028756106382298, "kl": 0.067626953125, "learning_rate": 7.214355468749999e-07, "loss": 0.0027, "reward": 1.8816375732421875, "reward_std": 0.13509927690029144, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8816376030445099, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 293.8515625, "epoch": 1.115234375, "grad_norm": 1.7934862249340708, "kl": 0.070068359375, "learning_rate": 7.213134765624999e-07, "loss": 0.0028, "reward": 1.7188656330108643, "reward_std": 0.05575054790824652, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7188656330108643, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 253.9140625, "epoch": 1.11572265625, "grad_norm": 0.7312070262378836, "kl": 0.07373046875, "learning_rate": 7.2119140625e-07, "loss": 0.003, "reward": 1.8175573348999023, "reward_std": 0.01690027490258217, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.81755730509758, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 278.078125, "epoch": 1.1162109375, "grad_norm": 2.030078969268682, "kl": 0.05712890625, "learning_rate": 7.210693359375e-07, "loss": 0.0023, "reward": 1.7928734421730042, "reward_std": 0.06959575228393078, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7928734421730042, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 301.765625, "epoch": 1.11669921875, "grad_norm": 1.6694669880542703, "kl": 0.085205078125, "learning_rate": 7.20947265625e-07, "loss": 0.0034, "reward": 1.8147171139717102, "reward_std": 0.11030293442308903, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8303420841693878, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 297.234375, "epoch": 1.1171875, "grad_norm": 0.9274138439502736, "kl": 0.06103515625, "learning_rate": 7.208251953125e-07, "loss": 0.0024, "reward": 1.8761171102523804, "reward_std": 0.04183840565383434, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8761171698570251, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 275.671875, "epoch": 1.11767578125, "grad_norm": 13.021849727212583, "kl": 0.07421875, "learning_rate": 7.207031249999999e-07, "loss": 0.003, "reward": 1.7165254354476929, "reward_std": 0.092881940305233, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7243379950523376, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 254.03125, "epoch": 1.1181640625, "grad_norm": 1.334539370731693, "kl": 0.08642578125, "learning_rate": 7.205810546874999e-07, "loss": 0.0035, "reward": 1.8177083730697632, "reward_std": 0.09210111945867538, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8177083432674408, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 347.1796875, "epoch": 1.11865234375, "grad_norm": 1.9031635634435449, "kl": 0.0635986328125, "learning_rate": 7.20458984375e-07, "loss": 0.0025, "reward": 1.652937114238739, "reward_std": 0.07407059520483017, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6529370546340942, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 267.296875, "epoch": 1.119140625, "grad_norm": 0.8218601392312246, "kl": 0.060791015625, "learning_rate": 7.203369140625e-07, "loss": 0.0024, "reward": 1.813718318939209, "reward_std": 0.07145040668547153, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.813718318939209, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 287.5703125, "epoch": 1.11962890625, "grad_norm": 1.467119269254602, "kl": 0.0623779296875, "learning_rate": 7.2021484375e-07, "loss": 0.0025, "reward": 1.740653932094574, "reward_std": 0.08345598913729191, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.740653932094574, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 306.6328125, "epoch": 1.1201171875, "grad_norm": 1.0241103270793768, "kl": 0.0494384765625, "learning_rate": 7.200927734375e-07, "loss": 0.002, "reward": 1.7661115527153015, "reward_std": 0.14179787784814835, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7973615527153015, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 327.765625, "epoch": 1.12060546875, "grad_norm": 2.4337023505740434, "kl": 0.057373046875, "learning_rate": 7.19970703125e-07, "loss": 0.0023, "reward": 1.7818644642829895, "reward_std": 0.13211066648364067, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7896769940853119, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 279.0859375, "epoch": 1.12109375, "grad_norm": 2.159922083493317, "kl": 0.066650390625, "learning_rate": 7.198486328124999e-07, "loss": 0.0027, "reward": 1.7772237658500671, "reward_std": 0.08314445242285728, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7928487956523895, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 299.1328125, "epoch": 1.12158203125, "grad_norm": 1.7534330818508779, "kl": 0.0537109375, "learning_rate": 7.197265624999999e-07, "loss": 0.0021, "reward": 1.7676746845245361, "reward_std": 0.09902366809546947, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8067372441291809, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 273.7734375, "epoch": 1.1220703125, "grad_norm": 1.211513703798563, "kl": 0.04931640625, "learning_rate": 7.196044921875e-07, "loss": 0.002, "reward": 1.797850251197815, "reward_std": 0.07715418934822083, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7978502511978149, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 230.0703125, "epoch": 1.12255859375, "grad_norm": 1.4152986860468773, "kl": 0.0484619140625, "learning_rate": 7.19482421875e-07, "loss": 0.0019, "reward": 1.9088245630264282, "reward_std": 0.04718828946352005, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.908824622631073, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 204.6484375, "epoch": 1.123046875, "grad_norm": 1.3403931709546952, "kl": 0.0687255859375, "learning_rate": 7.193603515625e-07, "loss": 0.0027, "reward": 1.7832393050193787, "reward_std": 0.06331999599933624, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7832392752170563, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 302.2421875, "epoch": 1.12353515625, "grad_norm": 2.1834678287173177, "kl": 0.072998046875, "learning_rate": 7.1923828125e-07, "loss": 0.0029, "reward": 1.7606186866760254, "reward_std": 0.12387818098068237, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7684311270713806, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 312.921875, "epoch": 1.1240234375, "grad_norm": 1.5319637176279595, "kl": 0.0673828125, "learning_rate": 7.191162109374999e-07, "loss": 0.0027, "reward": 1.6937076449394226, "reward_std": 0.09303374774754047, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6937075853347778, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 332.09375, "epoch": 1.12451171875, "grad_norm": 2.6885932706099895, "kl": 0.0579833984375, "learning_rate": 7.189941406249999e-07, "loss": 0.0023, "reward": 1.7834733128547668, "reward_std": 0.042033152654767036, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7834733128547668, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 289.7109375, "epoch": 1.125, "grad_norm": 1.5126183880414965, "kl": 0.0614013671875, "learning_rate": 7.188720703125e-07, "loss": 0.0025, "reward": 1.7206319570541382, "reward_std": 0.03807243797928095, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7206319570541382, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 268.953125, "epoch": 1.12548828125, "grad_norm": 2.3253192988082416, "kl": 0.08740234375, "learning_rate": 7.1875e-07, "loss": 0.0035, "reward": 1.697283923625946, "reward_std": 0.038683134131133556, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.697283923625946, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 371.3125, "epoch": 1.1259765625, "grad_norm": 1.8971968478232022, "kl": 0.0548095703125, "learning_rate": 7.186279296875e-07, "loss": 0.0022, "reward": 1.5334055423736572, "reward_std": 0.17225759476423264, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.595905601978302, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 246.265625, "epoch": 1.12646484375, "grad_norm": 2.1324487343914655, "kl": 0.086669921875, "learning_rate": 7.18505859375e-07, "loss": 0.0035, "reward": 1.7237046957015991, "reward_std": 0.059116460382938385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7237046360969543, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 251.015625, "epoch": 1.126953125, "grad_norm": 1.8565481261961394, "kl": 0.079833984375, "learning_rate": 7.183837890625e-07, "loss": 0.0032, "reward": 1.5828680992126465, "reward_std": 0.043210539035499096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5828680694103241, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 226.15625, "epoch": 1.12744140625, "grad_norm": 0.4105661860693138, "kl": 0.0496826171875, "learning_rate": 7.182617187499999e-07, "loss": 0.002, "reward": 1.7693156599998474, "reward_std": 0.06646117940545082, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7771281003952026, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 266.3046875, "epoch": 1.1279296875, "grad_norm": 5.888256460420409, "kl": 0.0693359375, "learning_rate": 7.181396484374999e-07, "loss": 0.0028, "reward": 1.7534179091453552, "reward_std": 0.0729428380727768, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7534180283546448, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 310.0, "epoch": 1.12841796875, "grad_norm": 1.3969601396928006, "kl": 0.0654296875, "learning_rate": 7.18017578125e-07, "loss": 0.0026, "reward": 1.7623464465141296, "reward_std": 0.10179652273654938, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7779714465141296, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 219.9453125, "epoch": 1.12890625, "grad_norm": 1.5953364275642339, "kl": 0.0555419921875, "learning_rate": 7.178955078125e-07, "loss": 0.0022, "reward": 1.7922866940498352, "reward_std": 0.032217446714639664, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7922867238521576, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 309.1953125, "epoch": 1.12939453125, "grad_norm": 1.7562620435246095, "kl": 0.06103515625, "learning_rate": 7.177734375e-07, "loss": 0.0024, "reward": 1.7869414687156677, "reward_std": 0.06921962834894657, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7869414389133453, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 280.8671875, "epoch": 1.1298828125, "grad_norm": 0.6578555181903704, "kl": 0.0479736328125, "learning_rate": 7.176513671875e-07, "loss": 0.0019, "reward": 1.7676367163658142, "reward_std": 0.0514880558475852, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7754492163658142, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 275.0, "epoch": 1.13037109375, "grad_norm": 6.75545813289656, "kl": 0.0570068359375, "learning_rate": 7.175292968749999e-07, "loss": 0.0023, "reward": 1.7726652026176453, "reward_std": 0.03823528438806534, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7726651132106781, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 319.8828125, "epoch": 1.130859375, "grad_norm": 1.2309637241993088, "kl": 0.05224609375, "learning_rate": 7.174072265624999e-07, "loss": 0.0021, "reward": 1.7631428241729736, "reward_std": 0.03950107842683792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7631428837776184, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 304.3828125, "epoch": 1.13134765625, "grad_norm": 0.886493942657421, "kl": 0.06494140625, "learning_rate": 7.1728515625e-07, "loss": 0.0026, "reward": 1.791369915008545, "reward_std": 0.06947879865765572, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7991823554039001, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 288.15625, "epoch": 1.1318359375, "grad_norm": 1.6449264544715965, "kl": 0.076171875, "learning_rate": 7.171630859375e-07, "loss": 0.003, "reward": 1.820662021636963, "reward_std": 0.07318861410021782, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8206620216369629, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 329.15625, "epoch": 1.13232421875, "grad_norm": 2.512895927696398, "kl": 0.095703125, "learning_rate": 7.17041015625e-07, "loss": 0.0038, "reward": 1.6627293825149536, "reward_std": 0.08104284480214119, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.662729412317276, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 219.03125, "epoch": 1.1328125, "grad_norm": 3.257367159754489, "kl": 0.080810546875, "learning_rate": 7.169189453125e-07, "loss": 0.0032, "reward": 1.7540556192398071, "reward_std": 0.026656273752450943, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7540555894374847, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 289.09375, "epoch": 1.13330078125, "grad_norm": 2.3206529623904815, "kl": 0.076416015625, "learning_rate": 7.16796875e-07, "loss": 0.0031, "reward": 1.7328737378120422, "reward_std": 0.06623134948313236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7328737676143646, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 299.71875, "epoch": 1.1337890625, "grad_norm": 2.7928034489135585, "kl": 0.0673828125, "learning_rate": 7.166748046874999e-07, "loss": 0.0027, "reward": 1.7304103374481201, "reward_std": 0.08190120384097099, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7304103970527649, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 252.1015625, "epoch": 1.13427734375, "grad_norm": 1.999816042395783, "kl": 0.0556640625, "learning_rate": 7.165527343749999e-07, "loss": 0.0022, "reward": 1.838409960269928, "reward_std": 0.058572327718138695, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.838409960269928, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 357.8984375, "epoch": 1.134765625, "grad_norm": 1.9833817250272083, "kl": 0.081298828125, "learning_rate": 7.164306640625e-07, "loss": 0.0032, "reward": 1.7137295007705688, "reward_std": 0.1032501645386219, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7137295603752136, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 218.0078125, "epoch": 1.13525390625, "grad_norm": 1.8188360125282768, "kl": 0.068115234375, "learning_rate": 7.1630859375e-07, "loss": 0.0027, "reward": 1.7000929713249207, "reward_std": 0.02517910674214363, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.700093001127243, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 326.5703125, "epoch": 1.1357421875, "grad_norm": 8.765129216379982, "kl": 0.0595703125, "learning_rate": 7.161865234375e-07, "loss": 0.0024, "reward": 1.8266154527664185, "reward_std": 0.05450385436415672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8266153633594513, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 336.3828125, "epoch": 1.13623046875, "grad_norm": 2.8091258579231586, "kl": 0.0650634765625, "learning_rate": 7.16064453125e-07, "loss": 0.0026, "reward": 1.7895740866661072, "reward_std": 0.13050633668899536, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7973865866661072, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 272.375, "epoch": 1.13671875, "grad_norm": 0.6840155091097224, "kl": 0.06005859375, "learning_rate": 7.159423828124999e-07, "loss": 0.0024, "reward": 1.7351736426353455, "reward_std": 0.014591011684387922, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7351736426353455, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 301.015625, "epoch": 1.13720703125, "grad_norm": 1.4187623188678093, "kl": 0.0736083984375, "learning_rate": 7.158203124999999e-07, "loss": 0.0029, "reward": 1.7072078585624695, "reward_std": 0.057367969304323196, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7072078585624695, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 339.71875, "epoch": 1.1376953125, "grad_norm": 0.9798231552523725, "kl": 0.0670166015625, "learning_rate": 7.156982421875e-07, "loss": 0.0027, "reward": 1.818799912929535, "reward_std": 0.05518978089094162, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8187999427318573, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 271.6875, "epoch": 1.13818359375, "grad_norm": 2.8180791927814703, "kl": 0.057373046875, "learning_rate": 7.15576171875e-07, "loss": 0.0023, "reward": 1.7160167694091797, "reward_std": 0.040740249678492546, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7160168290138245, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 300.4453125, "epoch": 1.138671875, "grad_norm": 2.169648391438765, "kl": 0.0693359375, "learning_rate": 7.154541015625e-07, "loss": 0.0028, "reward": 1.6581083536148071, "reward_std": 0.11767758429050446, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6581082940101624, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 353.1875, "epoch": 1.13916015625, "grad_norm": 1.0398806310768511, "kl": 0.056884765625, "learning_rate": 7.1533203125e-07, "loss": 0.0023, "reward": 1.6300272941589355, "reward_std": 0.08064482361078262, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6378397643566132, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 281.3359375, "epoch": 1.1396484375, "grad_norm": 2.2107422861873403, "kl": 0.0625, "learning_rate": 7.152099609375e-07, "loss": 0.0025, "reward": 1.5821011662483215, "reward_std": 0.18704190105199814, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5899136066436768, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 391.625, "epoch": 1.14013671875, "grad_norm": 0.8138393830121889, "kl": 0.050048828125, "learning_rate": 7.150878906249999e-07, "loss": 0.002, "reward": 1.7018383741378784, "reward_std": 0.10387471597641706, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7487134337425232, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 276.2109375, "epoch": 1.140625, "grad_norm": 1.7447653610028946, "kl": 0.0712890625, "learning_rate": 7.149658203124999e-07, "loss": 0.0028, "reward": 1.771507978439331, "reward_std": 0.02958191279321909, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7715080082416534, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 416.3203125, "epoch": 1.14111328125, "grad_norm": 0.8398042884116915, "kl": 0.04736328125, "learning_rate": 7.1484375e-07, "loss": 0.0019, "reward": 1.6066496968269348, "reward_std": 0.1151208933442831, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6378996670246124, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 264.6953125, "epoch": 1.1416015625, "grad_norm": 4.216576593319963, "kl": 0.0782470703125, "learning_rate": 7.147216796875e-07, "loss": 0.0031, "reward": 1.6847835779190063, "reward_std": 0.06190246529877186, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6847835183143616, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 292.1015625, "epoch": 1.14208984375, "grad_norm": 1.4146700885087296, "kl": 0.0638427734375, "learning_rate": 7.14599609375e-07, "loss": 0.0026, "reward": 1.741209328174591, "reward_std": 0.06798835471272469, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7412092983722687, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 286.3125, "epoch": 1.142578125, "grad_norm": 0.8804028276553788, "kl": 0.0650634765625, "learning_rate": 7.144775390625e-07, "loss": 0.0026, "reward": 1.9364939332008362, "reward_std": 0.07677973434329033, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9443064332008362, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 363.8984375, "epoch": 1.14306640625, "grad_norm": 3.528715281639079, "kl": 0.0560302734375, "learning_rate": 7.143554687499999e-07, "loss": 0.0022, "reward": 1.7230896353721619, "reward_std": 0.11856443714350462, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7387146353721619, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 310.4375, "epoch": 1.1435546875, "grad_norm": 2.133887430565937, "kl": 0.103515625, "learning_rate": 7.142333984374999e-07, "loss": 0.0041, "reward": 1.8173925876617432, "reward_std": 0.05221419036388397, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8252050876617432, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 253.265625, "epoch": 1.14404296875, "grad_norm": 2.0667954941578897, "kl": 0.061767578125, "learning_rate": 7.14111328125e-07, "loss": 0.0025, "reward": 1.8616973161697388, "reward_std": 0.0397907979786396, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8616973757743835, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 300.5703125, "epoch": 1.14453125, "grad_norm": 1.1764096283379186, "kl": 0.06494140625, "learning_rate": 7.139892578125e-07, "loss": 0.0026, "reward": 1.6104283928871155, "reward_std": 0.1447310373187065, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6338659226894379, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 308.2265625, "epoch": 1.14501953125, "grad_norm": 1.5286119984509583, "kl": 0.09033203125, "learning_rate": 7.138671875e-07, "loss": 0.0036, "reward": 1.5864279866218567, "reward_std": 0.05582820437848568, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5864280462265015, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 375.3125, "epoch": 1.1455078125, "grad_norm": 1.7441444479284598, "kl": 0.050537109375, "learning_rate": 7.137451171875e-07, "loss": 0.002, "reward": 1.8168761134147644, "reward_std": 0.10111106187105179, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8246885538101196, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 310.4609375, "epoch": 1.14599609375, "grad_norm": 0.7495543880714017, "kl": 0.05126953125, "learning_rate": 7.13623046875e-07, "loss": 0.0021, "reward": 1.703368902206421, "reward_std": 0.0950179323554039, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7111814320087433, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 305.359375, "epoch": 1.146484375, "grad_norm": 2.3548131843986604, "kl": 0.06591796875, "learning_rate": 7.135009765624999e-07, "loss": 0.0026, "reward": 1.7099797129631042, "reward_std": 0.1034369133412838, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7177922427654266, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 339.3984375, "epoch": 1.14697265625, "grad_norm": 1.4519222766859223, "kl": 0.0609130859375, "learning_rate": 7.133789062499999e-07, "loss": 0.0024, "reward": 1.7761664390563965, "reward_std": 0.05691366642713547, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7761664390563965, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 305.1015625, "epoch": 1.1474609375, "grad_norm": 2.4476278864349985, "kl": 0.0703125, "learning_rate": 7.132568359375e-07, "loss": 0.0028, "reward": 1.6659197211265564, "reward_std": 0.09462928026914597, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6659197509288788, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 390.3515625, "epoch": 1.14794921875, "grad_norm": 17.700669961292856, "kl": 0.05859375, "learning_rate": 7.13134765625e-07, "loss": 0.0023, "reward": 1.6661089062690735, "reward_std": 0.1225300058722496, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6895464062690735, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 298.7265625, "epoch": 1.1484375, "grad_norm": 1.5194236550305051, "kl": 0.0618896484375, "learning_rate": 7.130126953125e-07, "loss": 0.0025, "reward": 1.8306081295013428, "reward_std": 0.0984015129506588, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8384206891059875, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 252.96875, "epoch": 1.14892578125, "grad_norm": 4.927122085791297, "kl": 0.055908203125, "learning_rate": 7.12890625e-07, "loss": 0.0022, "reward": 1.7711811065673828, "reward_std": 0.0485275574028492, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7711811363697052, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 336.6328125, "epoch": 1.1494140625, "grad_norm": 1.2056839611949453, "kl": 0.0712890625, "learning_rate": 7.127685546875e-07, "loss": 0.0028, "reward": 1.8243648409843445, "reward_std": 0.027884284034371376, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8243648409843445, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 244.109375, "epoch": 1.14990234375, "grad_norm": 2.287808978760034, "kl": 0.075927734375, "learning_rate": 7.126464843749999e-07, "loss": 0.003, "reward": 1.8395601511001587, "reward_std": 0.017794081941246986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8395601511001587, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 302.375, "epoch": 1.150390625, "grad_norm": 3.6128285986688082, "kl": 0.068603515625, "learning_rate": 7.125244140624999e-07, "loss": 0.0027, "reward": 1.7480557560920715, "reward_std": 0.11520683020353317, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7636808156967163, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 250.0234375, "epoch": 1.15087890625, "grad_norm": 1.1324473137402855, "kl": 0.04541015625, "learning_rate": 7.1240234375e-07, "loss": 0.0018, "reward": 1.7983075976371765, "reward_std": 0.07985487952828407, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7983075678348541, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 151.546875, "epoch": 1.1513671875, "grad_norm": 1.8359334191454069, "kl": 0.08544921875, "learning_rate": 7.122802734375e-07, "loss": 0.0034, "reward": 1.6366276741027832, "reward_std": 0.029252098873257637, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6366276144981384, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 312.4296875, "epoch": 1.15185546875, "grad_norm": 4.262206911923877, "kl": 0.0587158203125, "learning_rate": 7.12158203125e-07, "loss": 0.0023, "reward": 1.7909355163574219, "reward_std": 0.05595472827553749, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7909355759620667, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 373.3515625, "epoch": 1.15234375, "grad_norm": 7.9709277642475715, "kl": 0.0582275390625, "learning_rate": 7.120361328125e-07, "loss": 0.0023, "reward": 1.8145057559013367, "reward_std": 0.11433425173163414, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8223182857036591, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 410.15625, "epoch": 1.15283203125, "grad_norm": 1.3114368895833535, "kl": 0.053955078125, "learning_rate": 7.119140624999999e-07, "loss": 0.0022, "reward": 1.6761323809623718, "reward_std": 0.1001717671751976, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6995699405670166, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 357.5390625, "epoch": 1.1533203125, "grad_norm": 1.868219050538953, "kl": 0.0592041015625, "learning_rate": 7.117919921874999e-07, "loss": 0.0024, "reward": 1.7569094896316528, "reward_std": 0.11767644435167313, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7647219896316528, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 387.0234375, "epoch": 1.15380859375, "grad_norm": 1.2672827889776317, "kl": 0.0457763671875, "learning_rate": 7.11669921875e-07, "loss": 0.0018, "reward": 1.7769032716751099, "reward_std": 0.06862462218850851, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7769032120704651, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 374.6015625, "epoch": 1.154296875, "grad_norm": 1.2955402321867606, "kl": 0.0517578125, "learning_rate": 7.115478515625e-07, "loss": 0.0021, "reward": 1.7472400069236755, "reward_std": 0.1712198220193386, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7784900069236755, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 336.0546875, "epoch": 1.15478515625, "grad_norm": 2.2402729174427516, "kl": 0.06640625, "learning_rate": 7.1142578125e-07, "loss": 0.0026, "reward": 1.7722741961479187, "reward_std": 0.1572416089475155, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7878992259502411, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 262.7890625, "epoch": 1.1552734375, "grad_norm": 1.0208911322785654, "kl": 0.066650390625, "learning_rate": 7.113037109375e-07, "loss": 0.0027, "reward": 1.755761444568634, "reward_std": 0.07006818428635597, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7557614147663116, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 212.25, "epoch": 1.15576171875, "grad_norm": 2.065142149687345, "kl": 0.063720703125, "learning_rate": 7.11181640625e-07, "loss": 0.0025, "reward": 1.7702951431274414, "reward_std": 0.03799489140510559, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7781076431274414, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 283.5234375, "epoch": 1.15625, "grad_norm": 1.9266816192587584, "kl": 0.067138671875, "learning_rate": 7.110595703124999e-07, "loss": 0.0027, "reward": 1.733154058456421, "reward_std": 0.04852524399757385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7331540882587433, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 313.8828125, "epoch": 1.15673828125, "grad_norm": 1.268819129630388, "kl": 0.076416015625, "learning_rate": 7.109374999999999e-07, "loss": 0.0031, "reward": 1.727283000946045, "reward_std": 0.04405433498322964, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7272829711437225, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 236.796875, "epoch": 1.1572265625, "grad_norm": 1.3015855264210114, "kl": 0.056396484375, "learning_rate": 7.108154296875e-07, "loss": 0.0023, "reward": 1.7930294871330261, "reward_std": 0.056975074112415314, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7930294573307037, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 280.6796875, "epoch": 1.15771484375, "grad_norm": 1.4361230941581011, "kl": 0.0504150390625, "learning_rate": 7.10693359375e-07, "loss": 0.002, "reward": 1.783621370792389, "reward_std": 0.04106577858328819, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7836213707923889, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 303.4140625, "epoch": 1.158203125, "grad_norm": 1.375569448092502, "kl": 0.0589599609375, "learning_rate": 7.105712890625e-07, "loss": 0.0024, "reward": 1.7498807311058044, "reward_std": 0.06763119343668222, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7576932013034821, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 250.375, "epoch": 1.15869140625, "grad_norm": 0.5023786129439751, "kl": 0.05517578125, "learning_rate": 7.1044921875e-07, "loss": 0.0022, "reward": 1.8890994787216187, "reward_std": 0.020692605525255203, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8890994787216187, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 256.3984375, "epoch": 1.1591796875, "grad_norm": 1.7393139909940287, "kl": 0.081298828125, "learning_rate": 7.103271484374999e-07, "loss": 0.0032, "reward": 1.7087842226028442, "reward_std": 0.033358908258378506, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7087842524051666, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 350.7734375, "epoch": 1.15966796875, "grad_norm": 1.628089694830596, "kl": 0.0567626953125, "learning_rate": 7.102050781249999e-07, "loss": 0.0023, "reward": 1.7870800495147705, "reward_std": 0.04601499065756798, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7870800197124481, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 290.046875, "epoch": 1.16015625, "grad_norm": 1.0821890704171715, "kl": 0.0506591796875, "learning_rate": 7.100830078125e-07, "loss": 0.002, "reward": 1.8657442927360535, "reward_std": 0.10163949802517891, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8735567629337311, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 298.375, "epoch": 1.16064453125, "grad_norm": 1.5005134329516172, "kl": 0.0582275390625, "learning_rate": 7.099609375e-07, "loss": 0.0023, "reward": 1.7599137425422668, "reward_std": 0.05832270160317421, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7599137425422668, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 318.4453125, "epoch": 1.1611328125, "grad_norm": 1.1183281293568572, "kl": 0.0633544921875, "learning_rate": 7.098388671875e-07, "loss": 0.0025, "reward": 1.8047245144844055, "reward_std": 0.08250847831368446, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8047245144844055, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 249.28125, "epoch": 1.16162109375, "grad_norm": 1.5202076197640586, "kl": 0.056396484375, "learning_rate": 7.09716796875e-07, "loss": 0.0023, "reward": 1.9078629612922668, "reward_std": 0.029499279335141182, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9078629016876221, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 249.4296875, "epoch": 1.162109375, "grad_norm": 1.806142323321523, "kl": 0.0577392578125, "learning_rate": 7.095947265625e-07, "loss": 0.0023, "reward": 1.755949079990387, "reward_std": 0.09004699625074863, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.755949079990387, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 240.3359375, "epoch": 1.16259765625, "grad_norm": 2.5409784656610634, "kl": 0.06298828125, "learning_rate": 7.094726562499999e-07, "loss": 0.0025, "reward": 1.785530149936676, "reward_std": 0.046783702448010445, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7855301201343536, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 421.84375, "epoch": 1.1630859375, "grad_norm": 1.5410866702591242, "kl": 0.0582275390625, "learning_rate": 7.093505859374999e-07, "loss": 0.0023, "reward": 1.6306228041648865, "reward_std": 0.10624398104846478, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6462478041648865, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 342.1328125, "epoch": 1.16357421875, "grad_norm": 1.5076095428158183, "kl": 0.0550537109375, "learning_rate": 7.09228515625e-07, "loss": 0.0022, "reward": 1.8531184792518616, "reward_std": 0.09160671941936016, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.860931009054184, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 410.921875, "epoch": 1.1640625, "grad_norm": 2.1539995428471905, "kl": 0.070068359375, "learning_rate": 7.091064453125e-07, "loss": 0.0028, "reward": 1.7696999311447144, "reward_std": 0.10432733595371246, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7853248119354248, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 312.0, "epoch": 1.16455078125, "grad_norm": 2.777834781481418, "kl": 0.05322265625, "learning_rate": 7.08984375e-07, "loss": 0.0021, "reward": 1.7855026125907898, "reward_std": 0.08844604343175888, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7933151125907898, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 285.21875, "epoch": 1.1650390625, "grad_norm": 2.3212115933760553, "kl": 0.072021484375, "learning_rate": 7.088623046875e-07, "loss": 0.0029, "reward": 1.658606767654419, "reward_std": 0.13669633120298386, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6742317080497742, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 369.03125, "epoch": 1.16552734375, "grad_norm": 1.5471087574001265, "kl": 0.061279296875, "learning_rate": 7.087402343749999e-07, "loss": 0.0024, "reward": 1.7448172569274902, "reward_std": 0.20727698504924774, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.783879816532135, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 267.953125, "epoch": 1.166015625, "grad_norm": 3.4097330170492905, "kl": 0.0640869140625, "learning_rate": 7.086181640624999e-07, "loss": 0.0026, "reward": 1.765123426914215, "reward_std": 0.05697265453636646, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7651234269142151, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 272.265625, "epoch": 1.16650390625, "grad_norm": 2.5094396166271236, "kl": 0.058837890625, "learning_rate": 7.0849609375e-07, "loss": 0.0024, "reward": 1.8069196343421936, "reward_std": 0.10948172211647034, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8147320747375488, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 382.0703125, "epoch": 1.1669921875, "grad_norm": 1.6288146794380969, "kl": 0.073974609375, "learning_rate": 7.083740234375e-07, "loss": 0.003, "reward": 1.7542518377304077, "reward_std": 0.045296634547412395, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7542518377304077, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 372.3828125, "epoch": 1.16748046875, "grad_norm": 1.5399608721774982, "kl": 0.0509033203125, "learning_rate": 7.08251953125e-07, "loss": 0.002, "reward": 1.70972341299057, "reward_std": 0.16795263439416885, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7331609427928925, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 310.4375, "epoch": 1.16796875, "grad_norm": 2.2999725959208788, "kl": 0.0594482421875, "learning_rate": 7.081298828125e-07, "loss": 0.0024, "reward": 1.8128122091293335, "reward_std": 0.09195205383002758, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8128121495246887, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 218.0859375, "epoch": 1.16845703125, "grad_norm": 1.940816342699902, "kl": 0.068359375, "learning_rate": 7.080078125e-07, "loss": 0.0027, "reward": 1.747936189174652, "reward_std": 0.08866530656814575, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7479361891746521, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 323.6640625, "epoch": 1.1689453125, "grad_norm": 1.765314724522433, "kl": 0.052001953125, "learning_rate": 7.078857421874999e-07, "loss": 0.0021, "reward": 1.709149956703186, "reward_std": 0.06337589770555496, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.709149956703186, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 230.6015625, "epoch": 1.16943359375, "grad_norm": 7.062752985549045, "kl": 0.07958984375, "learning_rate": 7.077636718749999e-07, "loss": 0.0032, "reward": 1.6336697340011597, "reward_std": 0.1141487006098032, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6414822340011597, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 266.6640625, "epoch": 1.169921875, "grad_norm": 1.0620017021682755, "kl": 0.049560546875, "learning_rate": 7.076416015625e-07, "loss": 0.002, "reward": 1.9040113687515259, "reward_std": 0.05147293955087662, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.904011458158493, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 215.3125, "epoch": 1.17041015625, "grad_norm": 1.3276612147735944, "kl": 0.075927734375, "learning_rate": 7.0751953125e-07, "loss": 0.003, "reward": 1.7077276706695557, "reward_std": 0.0732121616601944, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7077276408672333, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 247.671875, "epoch": 1.1708984375, "grad_norm": 1.4011487524978836, "kl": 0.079833984375, "learning_rate": 7.073974609375e-07, "loss": 0.0032, "reward": 1.7469829320907593, "reward_std": 0.0623103235848248, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7469829320907593, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 275.90625, "epoch": 1.17138671875, "grad_norm": 2.1429163994559337, "kl": 0.0740966796875, "learning_rate": 7.07275390625e-07, "loss": 0.003, "reward": 1.8036177158355713, "reward_std": 0.07194317691028118, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8036176562309265, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 287.75, "epoch": 1.171875, "grad_norm": 1.7405554380960897, "kl": 0.066162109375, "learning_rate": 7.071533203124999e-07, "loss": 0.0026, "reward": 1.6592023372650146, "reward_std": 0.08969663083553314, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6592022776603699, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 220.328125, "epoch": 1.17236328125, "grad_norm": 1.2593548286372285, "kl": 0.068115234375, "learning_rate": 7.070312499999999e-07, "loss": 0.0027, "reward": 1.7829896211624146, "reward_std": 0.0856513325124979, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7829896509647369, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 277.2109375, "epoch": 1.1728515625, "grad_norm": 3.1176602902157495, "kl": 0.0859375, "learning_rate": 7.069091796875e-07, "loss": 0.0034, "reward": 1.8581604957580566, "reward_std": 0.10554312914609909, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.858160525560379, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 169.625, "epoch": 1.17333984375, "grad_norm": 1.3117450441494156, "kl": 0.082763671875, "learning_rate": 7.06787109375e-07, "loss": 0.0033, "reward": 1.706332802772522, "reward_std": 0.04083455912768841, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.706332802772522, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 278.4140625, "epoch": 1.173828125, "grad_norm": 3.430690325991213, "kl": 0.13623046875, "learning_rate": 7.066650390625e-07, "loss": 0.0055, "reward": 1.6918965578079224, "reward_std": 0.06476756557822227, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6918965578079224, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 279.0, "epoch": 1.17431640625, "grad_norm": 2.062517429740279, "kl": 0.072509765625, "learning_rate": 7.0654296875e-07, "loss": 0.0029, "reward": 1.6687769293785095, "reward_std": 0.0822465568780899, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6687769889831543, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 287.5390625, "epoch": 1.1748046875, "grad_norm": 1.479713578737759, "kl": 0.065673828125, "learning_rate": 7.064208984375e-07, "loss": 0.0026, "reward": 1.843060851097107, "reward_std": 0.05525344889611006, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8430608510971069, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 265.21875, "epoch": 1.17529296875, "grad_norm": 5.418876184032981, "kl": 0.0703125, "learning_rate": 7.062988281249999e-07, "loss": 0.0028, "reward": 1.7128131985664368, "reward_std": 0.0804828368127346, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7128131687641144, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 231.625, "epoch": 1.17578125, "grad_norm": 2.7306643847833683, "kl": 0.08154296875, "learning_rate": 7.061767578124999e-07, "loss": 0.0033, "reward": 1.84114408493042, "reward_std": 0.047078766860067844, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8411440849304199, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 257.4296875, "epoch": 1.17626953125, "grad_norm": 0.959063992930826, "kl": 0.0633544921875, "learning_rate": 7.060546875e-07, "loss": 0.0025, "reward": 1.8835274577140808, "reward_std": 0.06786072719842196, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8913399577140808, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 214.578125, "epoch": 1.1767578125, "grad_norm": 0.8022740950274427, "kl": 0.082275390625, "learning_rate": 7.059326171875e-07, "loss": 0.0033, "reward": 1.8769137263298035, "reward_std": 0.09892814233899117, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8847261667251587, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 250.8125, "epoch": 1.17724609375, "grad_norm": 1.4812068400181597, "kl": 0.0830078125, "learning_rate": 7.05810546875e-07, "loss": 0.0033, "reward": 1.8106178045272827, "reward_std": 0.13285555690526962, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8262427449226379, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 321.421875, "epoch": 1.177734375, "grad_norm": 4.6584927782887195, "kl": 0.06884765625, "learning_rate": 7.056884765625e-07, "loss": 0.0028, "reward": 1.6145755648612976, "reward_std": 0.12683077156543732, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.62238809466362, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 262.5859375, "epoch": 1.17822265625, "grad_norm": 1.3273325229855206, "kl": 0.0693359375, "learning_rate": 7.055664062499999e-07, "loss": 0.0028, "reward": 1.8337931036949158, "reward_std": 0.06046081706881523, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8337931036949158, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 271.2265625, "epoch": 1.1787109375, "grad_norm": 1.1873786934092632, "kl": 0.062744140625, "learning_rate": 7.054443359374999e-07, "loss": 0.0025, "reward": 1.8494738936424255, "reward_std": 0.07496330887079239, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8494738638401031, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 291.4375, "epoch": 1.17919921875, "grad_norm": 1.2850630040194209, "kl": 0.0634765625, "learning_rate": 7.05322265625e-07, "loss": 0.0025, "reward": 1.740599811077118, "reward_std": 0.04363143816590309, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7405998110771179, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 234.8359375, "epoch": 1.1796875, "grad_norm": 2.6529425377978475, "kl": 0.07861328125, "learning_rate": 7.052001953125e-07, "loss": 0.0032, "reward": 1.8151870369911194, "reward_std": 0.13191955909132957, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.822999507188797, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 211.4921875, "epoch": 1.18017578125, "grad_norm": 2.380781065882567, "kl": 0.073974609375, "learning_rate": 7.05078125e-07, "loss": 0.003, "reward": 1.8084670305252075, "reward_std": 0.05959512945264578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8084670305252075, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 302.46875, "epoch": 1.1806640625, "grad_norm": 1.633451395929759, "kl": 0.0732421875, "learning_rate": 7.049560546875e-07, "loss": 0.0029, "reward": 1.8302597403526306, "reward_std": 0.10631529986858368, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8458847999572754, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 269.8984375, "epoch": 1.18115234375, "grad_norm": 1.095996447538376, "kl": 0.065673828125, "learning_rate": 7.04833984375e-07, "loss": 0.0026, "reward": 1.6789276599884033, "reward_std": 0.07097472064197063, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6867401003837585, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 279.4375, "epoch": 1.181640625, "grad_norm": 0.6358463267113961, "kl": 0.0692138671875, "learning_rate": 7.047119140624999e-07, "loss": 0.0028, "reward": 1.7676212787628174, "reward_std": 0.02877889759838581, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.767621248960495, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 251.0546875, "epoch": 1.18212890625, "grad_norm": 1.7201953005283817, "kl": 0.06591796875, "learning_rate": 7.045898437499999e-07, "loss": 0.0026, "reward": 1.6933047771453857, "reward_std": 0.11904028803110123, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7089297771453857, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 317.84375, "epoch": 1.1826171875, "grad_norm": 1.642795231335406, "kl": 0.076171875, "learning_rate": 7.044677734375e-07, "loss": 0.003, "reward": 1.7209062576293945, "reward_std": 0.08078465051949024, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7287188470363617, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 264.390625, "epoch": 1.18310546875, "grad_norm": 1.697016218007385, "kl": 0.07861328125, "learning_rate": 7.04345703125e-07, "loss": 0.0031, "reward": 1.7383880019187927, "reward_std": 0.033076136372983456, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7383880317211151, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 283.96875, "epoch": 1.18359375, "grad_norm": 2.3807287354436486, "kl": 0.0628662109375, "learning_rate": 7.042236328125e-07, "loss": 0.0025, "reward": 1.785739779472351, "reward_std": 0.0659907665103674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7857397794723511, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 303.921875, "epoch": 1.18408203125, "grad_norm": 1.850170065076833, "kl": 0.077880859375, "learning_rate": 7.041015625e-07, "loss": 0.0031, "reward": 1.8572614789009094, "reward_std": 0.03198802284896374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8572614789009094, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 301.2578125, "epoch": 1.1845703125, "grad_norm": 1.250303267080643, "kl": 0.0650634765625, "learning_rate": 7.039794921874999e-07, "loss": 0.0026, "reward": 1.6902012825012207, "reward_std": 0.04371343832463026, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6902012228965759, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 358.25, "epoch": 1.18505859375, "grad_norm": 5.468487040253603, "kl": 0.0849609375, "learning_rate": 7.038574218749999e-07, "loss": 0.0034, "reward": 1.557692527770996, "reward_std": 0.11165288090705872, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6280049979686737, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 258.5546875, "epoch": 1.185546875, "grad_norm": 1.456806917925311, "kl": 0.066650390625, "learning_rate": 7.037353515625e-07, "loss": 0.0027, "reward": 1.66695636510849, "reward_std": 0.03848722204566002, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6669564247131348, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 331.078125, "epoch": 1.18603515625, "grad_norm": 2.9368301614032495, "kl": 0.0667724609375, "learning_rate": 7.0361328125e-07, "loss": 0.0027, "reward": 1.7350419759750366, "reward_std": 0.09676255099475384, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7584794461727142, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 292.4375, "epoch": 1.1865234375, "grad_norm": 2.219443545795303, "kl": 0.0638427734375, "learning_rate": 7.034912109375e-07, "loss": 0.0026, "reward": 1.8254042863845825, "reward_std": 0.0871292520314455, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8254042863845825, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 241.7265625, "epoch": 1.18701171875, "grad_norm": 1.0660266946811585, "kl": 0.079345703125, "learning_rate": 7.03369140625e-07, "loss": 0.0032, "reward": 1.7727646231651306, "reward_std": 0.06067582964897156, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.772764652967453, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 309.8515625, "epoch": 1.1875, "grad_norm": 2.0455673981048337, "kl": 0.069091796875, "learning_rate": 7.032470703125e-07, "loss": 0.0028, "reward": 1.6600714921951294, "reward_std": 0.054243333637714386, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6600715816020966, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 320.046875, "epoch": 1.18798828125, "grad_norm": 2.3469567235126343, "kl": 0.074951171875, "learning_rate": 7.031249999999999e-07, "loss": 0.003, "reward": 1.717573642730713, "reward_std": 0.14434907957911491, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7410111129283905, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 334.8828125, "epoch": 1.1884765625, "grad_norm": 1.7159405891568082, "kl": 0.08154296875, "learning_rate": 7.030029296874999e-07, "loss": 0.0033, "reward": 1.7161504030227661, "reward_std": 0.03489119280129671, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7161504626274109, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 319.6171875, "epoch": 1.18896484375, "grad_norm": 2.209863344385722, "kl": 0.065185546875, "learning_rate": 7.02880859375e-07, "loss": 0.0026, "reward": 1.7833570837974548, "reward_std": 0.06764233857393265, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7833570539951324, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 303.4765625, "epoch": 1.189453125, "grad_norm": 5.044233806012345, "kl": 0.05859375, "learning_rate": 7.027587890625e-07, "loss": 0.0023, "reward": 1.771048367023468, "reward_std": 0.033720131730660796, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.771048367023468, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 304.2265625, "epoch": 1.18994140625, "grad_norm": 2.5922434578602305, "kl": 0.07177734375, "learning_rate": 7.0263671875e-07, "loss": 0.0029, "reward": 1.6098366379737854, "reward_std": 0.05740887112915516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.609836757183075, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 280.0078125, "epoch": 1.1904296875, "grad_norm": 1.8713184275572499, "kl": 0.080078125, "learning_rate": 7.025146484375e-07, "loss": 0.0032, "reward": 1.7736141681671143, "reward_std": 0.04480298818089068, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7736141979694366, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 362.2265625, "epoch": 1.19091796875, "grad_norm": 2.1515702226767512, "kl": 0.0732421875, "learning_rate": 7.02392578125e-07, "loss": 0.0029, "reward": 1.695095181465149, "reward_std": 0.12620120495557785, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7263452112674713, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 336.546875, "epoch": 1.19140625, "grad_norm": 2.5367504438666413, "kl": 0.0552978515625, "learning_rate": 7.022705078124999e-07, "loss": 0.0022, "reward": 1.7602566480636597, "reward_std": 0.11810046620666981, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.8149441182613373, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 271.3984375, "epoch": 1.19189453125, "grad_norm": 1.6103282302386244, "kl": 0.0760498046875, "learning_rate": 7.021484375e-07, "loss": 0.003, "reward": 1.5700552463531494, "reward_std": 0.09616255201399326, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5856801867485046, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 298.2890625, "epoch": 1.1923828125, "grad_norm": 2.776703466020179, "kl": 0.055908203125, "learning_rate": 7.020263671875e-07, "loss": 0.0022, "reward": 1.7921187281608582, "reward_std": 0.04691682942211628, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.792118638753891, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 434.8828125, "epoch": 1.19287109375, "grad_norm": 2.8510129285591836, "kl": 0.0478515625, "learning_rate": 7.01904296875e-07, "loss": 0.0019, "reward": 1.6040194630622864, "reward_std": 0.1493111103773117, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.650894433259964, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 352.8359375, "epoch": 1.193359375, "grad_norm": 0.9498794503148597, "kl": 0.068603515625, "learning_rate": 7.017822265625e-07, "loss": 0.0027, "reward": 1.7161058187484741, "reward_std": 0.09053925797343254, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7551683783531189, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 264.7734375, "epoch": 1.19384765625, "grad_norm": 1.1199023458800679, "kl": 0.061279296875, "learning_rate": 7.0166015625e-07, "loss": 0.0025, "reward": 1.8098965287208557, "reward_std": 0.08372041955590248, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8177090287208557, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 383.6015625, "epoch": 1.1943359375, "grad_norm": 0.47829963771188727, "kl": 0.0491943359375, "learning_rate": 7.015380859374999e-07, "loss": 0.002, "reward": 1.6234931945800781, "reward_std": 0.18474455177783966, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.6938056945800781, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 312.484375, "epoch": 1.19482421875, "grad_norm": 1.6276283737432735, "kl": 0.0614013671875, "learning_rate": 7.014160156249999e-07, "loss": 0.0025, "reward": 1.7953330874443054, "reward_std": 0.08542214334011078, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8109579682350159, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 340.765625, "epoch": 1.1953125, "grad_norm": 2.960283467365649, "kl": 0.070068359375, "learning_rate": 7.012939453125e-07, "loss": 0.0028, "reward": 1.616200864315033, "reward_std": 0.16303523629903793, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.678700864315033, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 290.4296875, "epoch": 1.19580078125, "grad_norm": 1.8931874672712756, "kl": 0.070556640625, "learning_rate": 7.01171875e-07, "loss": 0.0028, "reward": 1.7042686939239502, "reward_std": 0.10324783250689507, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.712081253528595, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 365.7890625, "epoch": 1.1962890625, "grad_norm": 0.8846851645218154, "kl": 0.045654296875, "learning_rate": 7.010498046875e-07, "loss": 0.0018, "reward": 1.826434314250946, "reward_std": 0.16191211715340614, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8576842248439789, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 227.3046875, "epoch": 1.19677734375, "grad_norm": 0.7349892567727097, "kl": 0.0633544921875, "learning_rate": 7.00927734375e-07, "loss": 0.0025, "reward": 1.8562174439430237, "reward_std": 0.01508009573444724, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8562174439430237, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 432.4375, "epoch": 1.197265625, "grad_norm": 2.0795643682176217, "kl": 0.07666015625, "learning_rate": 7.008056640625e-07, "loss": 0.0031, "reward": 1.5798554420471191, "reward_std": 0.09327048435807228, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6189179420471191, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 352.4296875, "epoch": 1.19775390625, "grad_norm": 2.43960100948424, "kl": 0.0667724609375, "learning_rate": 7.006835937499999e-07, "loss": 0.0027, "reward": 1.6752365827560425, "reward_std": 0.16312190517783165, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6986740827560425, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 426.6953125, "epoch": 1.1982421875, "grad_norm": 1.783527107114228, "kl": 0.0546875, "learning_rate": 7.005615234374999e-07, "loss": 0.0022, "reward": 1.697092890739441, "reward_std": 0.12336409464478493, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7127178907394409, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 352.796875, "epoch": 1.19873046875, "grad_norm": 1.4386375964027276, "kl": 0.059814453125, "learning_rate": 7.00439453125e-07, "loss": 0.0024, "reward": 1.8195868134498596, "reward_std": 0.034168762154877186, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.819586843252182, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 308.0078125, "epoch": 1.19921875, "grad_norm": 1.0233839801880174, "kl": 0.0572509765625, "learning_rate": 7.003173828125e-07, "loss": 0.0023, "reward": 1.8272386193275452, "reward_std": 0.05041295662522316, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8272385895252228, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 291.9453125, "epoch": 1.19970703125, "grad_norm": 1.721124868164957, "kl": 0.0606689453125, "learning_rate": 7.001953125e-07, "loss": 0.0024, "reward": 1.6735165119171143, "reward_std": 0.0498051792383194, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6735165119171143, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 207.6640625, "epoch": 1.2001953125, "grad_norm": 4.63579264894847, "kl": 0.0703125, "learning_rate": 7.000732421875e-07, "loss": 0.0028, "reward": 1.8002795577049255, "reward_std": 0.04197111213579774, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8002796173095703, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 329.3359375, "epoch": 1.20068359375, "grad_norm": 1.546489313566168, "kl": 0.0576171875, "learning_rate": 6.999511718749999e-07, "loss": 0.0023, "reward": 1.721842348575592, "reward_std": 0.0830717384815216, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.721842348575592, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 274.6640625, "epoch": 1.201171875, "grad_norm": 5.094662458606525, "kl": 0.0626220703125, "learning_rate": 6.998291015624999e-07, "loss": 0.0025, "reward": 1.7491782903671265, "reward_std": 0.08964913338422775, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7491783201694489, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 379.1640625, "epoch": 1.20166015625, "grad_norm": 1.1949232897338642, "kl": 0.05615234375, "learning_rate": 6.9970703125e-07, "loss": 0.0022, "reward": 1.8746750950813293, "reward_std": 0.11067311465740204, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8824875354766846, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 244.4921875, "epoch": 1.2021484375, "grad_norm": 1.5899026672991512, "kl": 0.08447265625, "learning_rate": 6.995849609375e-07, "loss": 0.0034, "reward": 1.8029922246932983, "reward_std": 0.06205196492373943, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8029922544956207, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 300.1953125, "epoch": 1.20263671875, "grad_norm": 2.103963718527489, "kl": 0.0712890625, "learning_rate": 6.99462890625e-07, "loss": 0.0028, "reward": 1.7411906719207764, "reward_std": 0.07426265999674797, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7411905825138092, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 275.1484375, "epoch": 1.203125, "grad_norm": 1.2381185234846581, "kl": 0.0516357421875, "learning_rate": 6.993408203125e-07, "loss": 0.0021, "reward": 1.7142133712768555, "reward_std": 0.07610474899411201, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7220259010791779, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 202.40625, "epoch": 1.20361328125, "grad_norm": 6.431279471704583, "kl": 0.095703125, "learning_rate": 6.9921875e-07, "loss": 0.0038, "reward": 1.6048645973205566, "reward_std": 0.07959796488285065, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6048645377159119, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 236.3359375, "epoch": 1.2041015625, "grad_norm": 2.841010011660038, "kl": 0.068603515625, "learning_rate": 6.990966796874999e-07, "loss": 0.0027, "reward": 1.8495973944664001, "reward_std": 0.05448159575462341, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8495973944664001, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 260.8671875, "epoch": 1.20458984375, "grad_norm": 0.8842872987899542, "kl": 0.0574951171875, "learning_rate": 6.989746093749999e-07, "loss": 0.0023, "reward": 1.8488314151763916, "reward_std": 0.039236126467585564, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.848831444978714, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 353.640625, "epoch": 1.205078125, "grad_norm": 3.6653341295723876, "kl": 0.072265625, "learning_rate": 6.988525390625e-07, "loss": 0.0029, "reward": 1.7910266518592834, "reward_std": 0.05401626043021679, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7910265922546387, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 209.6796875, "epoch": 1.20556640625, "grad_norm": 1.4644528281755704, "kl": 0.078369140625, "learning_rate": 6.9873046875e-07, "loss": 0.0031, "reward": 1.8185259103775024, "reward_std": 0.01729111559689045, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8185259103775024, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 307.4453125, "epoch": 1.2060546875, "grad_norm": 0.8915286774826748, "kl": 0.072021484375, "learning_rate": 6.986083984375e-07, "loss": 0.0029, "reward": 1.7518900632858276, "reward_std": 0.0792790362611413, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7909526228904724, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 328.515625, "epoch": 1.20654296875, "grad_norm": 1.9404027572704816, "kl": 0.087158203125, "learning_rate": 6.98486328125e-07, "loss": 0.0035, "reward": 1.7426846027374268, "reward_std": 0.08018626365810633, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7426846027374268, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 289.0625, "epoch": 1.20703125, "grad_norm": 1.1767884723377213, "kl": 0.0731201171875, "learning_rate": 6.983642578124999e-07, "loss": 0.0029, "reward": 1.7661176919937134, "reward_std": 0.05781315267086029, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7661177515983582, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 287.7421875, "epoch": 1.20751953125, "grad_norm": 12.083794382746516, "kl": 0.07373046875, "learning_rate": 6.982421874999999e-07, "loss": 0.0029, "reward": 1.7419702410697937, "reward_std": 0.019140704069286585, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7419701814651489, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 247.8203125, "epoch": 1.2080078125, "grad_norm": 2.0703187627263686, "kl": 0.084228515625, "learning_rate": 6.981201171875e-07, "loss": 0.0034, "reward": 1.7072731852531433, "reward_std": 0.10907960124313831, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7150856554508209, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 231.4453125, "epoch": 1.20849609375, "grad_norm": 0.9422041926618386, "kl": 0.0816650390625, "learning_rate": 6.97998046875e-07, "loss": 0.0033, "reward": 1.7536945343017578, "reward_std": 0.02776573784649372, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.753694474697113, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 356.34375, "epoch": 1.208984375, "grad_norm": 0.8433697427169689, "kl": 0.0611572265625, "learning_rate": 6.978759765625e-07, "loss": 0.0024, "reward": 1.726251244544983, "reward_std": 0.06488487310707569, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7262513041496277, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 264.53125, "epoch": 1.20947265625, "grad_norm": 0.7248153544649567, "kl": 0.0712890625, "learning_rate": 6.9775390625e-07, "loss": 0.0029, "reward": 1.7210323810577393, "reward_std": 0.010402468382380903, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7210325002670288, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 312.3984375, "epoch": 1.2099609375, "grad_norm": 1.8834510853065414, "kl": 0.06201171875, "learning_rate": 6.976318359375e-07, "loss": 0.0025, "reward": 1.7611924409866333, "reward_std": 0.10708035714924335, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7690049111843109, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 274.8125, "epoch": 1.21044921875, "grad_norm": 4.270766248537916, "kl": 0.07763671875, "learning_rate": 6.975097656249999e-07, "loss": 0.0031, "reward": 1.814048945903778, "reward_std": 0.05252527166157961, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8140489459037781, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 367.5390625, "epoch": 1.2109375, "grad_norm": 2.2657492845126486, "kl": 0.0574951171875, "learning_rate": 6.973876953124999e-07, "loss": 0.0023, "reward": 1.809904932975769, "reward_std": 0.05868878960609436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8099049031734467, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 325.09375, "epoch": 1.21142578125, "grad_norm": 4.6466935324712235, "kl": 0.0562744140625, "learning_rate": 6.97265625e-07, "loss": 0.0023, "reward": 1.7215197086334229, "reward_std": 0.07658331096172333, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7762071788311005, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 258.0703125, "epoch": 1.2119140625, "grad_norm": 14.953714286957666, "kl": 0.133056640625, "learning_rate": 6.971435546875e-07, "loss": 0.0053, "reward": 1.7927291989326477, "reward_std": 0.027087991125881672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7927291989326477, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 364.1015625, "epoch": 1.21240234375, "grad_norm": 5.068041510063571, "kl": 0.0521240234375, "learning_rate": 6.97021484375e-07, "loss": 0.0021, "reward": 1.7825981974601746, "reward_std": 0.08510758727788925, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.790410727262497, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 250.5546875, "epoch": 1.212890625, "grad_norm": 1.2049058380875817, "kl": 0.0714111328125, "learning_rate": 6.968994140625e-07, "loss": 0.0029, "reward": 1.8188948035240173, "reward_std": 0.025680112652480602, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8188948333263397, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 383.7578125, "epoch": 1.21337890625, "grad_norm": 7.356014600027885, "kl": 0.069580078125, "learning_rate": 6.967773437499999e-07, "loss": 0.0028, "reward": 1.7450536489486694, "reward_std": 0.1266886219382286, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.752866119146347, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 353.0, "epoch": 1.2138671875, "grad_norm": 3.1051199689401043, "kl": 0.0675048828125, "learning_rate": 6.966552734374999e-07, "loss": 0.0027, "reward": 1.6235730051994324, "reward_std": 0.11907243356108665, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6470105350017548, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 364.953125, "epoch": 1.21435546875, "grad_norm": 1.6944501406666026, "kl": 0.069580078125, "learning_rate": 6.96533203125e-07, "loss": 0.0028, "reward": 1.8035091161727905, "reward_std": 0.07174506038427353, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8035090863704681, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 314.4140625, "epoch": 1.21484375, "grad_norm": 1.2360053249882443, "kl": 0.0732421875, "learning_rate": 6.964111328125e-07, "loss": 0.0029, "reward": 1.6692347526550293, "reward_std": 0.04010845720767975, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6692347228527069, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 333.6953125, "epoch": 1.21533203125, "grad_norm": 1.0885353537961944, "kl": 0.0716552734375, "learning_rate": 6.962890625e-07, "loss": 0.0029, "reward": 1.7111193537712097, "reward_std": 0.09523628279566765, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7189318835735321, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 317.7265625, "epoch": 1.2158203125, "grad_norm": 1.2668470125667208, "kl": 0.05712890625, "learning_rate": 6.961669921875e-07, "loss": 0.0023, "reward": 1.6149799227714539, "reward_std": 0.12601268105208874, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6384174823760986, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 263.703125, "epoch": 1.21630859375, "grad_norm": 1.7178893277816036, "kl": 0.0589599609375, "learning_rate": 6.96044921875e-07, "loss": 0.0024, "reward": 1.8464585542678833, "reward_std": 0.056060753762722015, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8464585840702057, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 291.734375, "epoch": 1.216796875, "grad_norm": 4.712548901479162, "kl": 0.0654296875, "learning_rate": 6.959228515624999e-07, "loss": 0.0026, "reward": 1.6734269857406616, "reward_std": 0.09399673715233803, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6734269857406616, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 297.765625, "epoch": 1.21728515625, "grad_norm": 1.0758561945731477, "kl": 0.0693359375, "learning_rate": 6.958007812499999e-07, "loss": 0.0028, "reward": 1.6628954410552979, "reward_std": 0.042761145159602165, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6628954112529755, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 241.6484375, "epoch": 1.2177734375, "grad_norm": 1.1584941663597192, "kl": 0.076171875, "learning_rate": 6.956787109375e-07, "loss": 0.003, "reward": 1.7079687118530273, "reward_std": 0.0789231238886714, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7079687416553497, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 306.3359375, "epoch": 1.21826171875, "grad_norm": 1.964653097717732, "kl": 0.062255859375, "learning_rate": 6.95556640625e-07, "loss": 0.0025, "reward": 1.6944482326507568, "reward_std": 0.03738341759890318, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6944482922554016, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 349.96875, "epoch": 1.21875, "grad_norm": 3.3640026796745857, "kl": 0.0626220703125, "learning_rate": 6.954345703125e-07, "loss": 0.0025, "reward": 1.784760594367981, "reward_std": 0.06138443388044834, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.784760594367981, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 313.4453125, "epoch": 1.21923828125, "grad_norm": 3.2144653678548583, "kl": 0.0732421875, "learning_rate": 6.953125e-07, "loss": 0.0029, "reward": 1.5434470176696777, "reward_std": 0.14664818346500397, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5590719878673553, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 234.5625, "epoch": 1.2197265625, "grad_norm": 1.6026752821456773, "kl": 0.0711669921875, "learning_rate": 6.951904296874999e-07, "loss": 0.0029, "reward": 1.719383180141449, "reward_std": 0.06268875673413277, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7193832099437714, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 258.8984375, "epoch": 1.22021484375, "grad_norm": 2.7339715672324685, "kl": 0.09521484375, "learning_rate": 6.950683593749999e-07, "loss": 0.0038, "reward": 1.6224533915519714, "reward_std": 0.03794710151851177, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6224533319473267, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 292.6171875, "epoch": 1.220703125, "grad_norm": 3.3456055155414854, "kl": 0.094970703125, "learning_rate": 6.949462890625e-07, "loss": 0.0038, "reward": 1.687516987323761, "reward_std": 0.04205773863941431, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.687516987323761, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 330.96875, "epoch": 1.22119140625, "grad_norm": 1.410321483173382, "kl": 0.0670166015625, "learning_rate": 6.9482421875e-07, "loss": 0.0027, "reward": 1.784384846687317, "reward_std": 0.03612975589931011, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7843847870826721, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 300.984375, "epoch": 1.2216796875, "grad_norm": 4.366344762597936, "kl": 0.078857421875, "learning_rate": 6.947021484375e-07, "loss": 0.0032, "reward": 1.7177372574806213, "reward_std": 0.1302860602736473, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7255498170852661, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 243.3671875, "epoch": 1.22216796875, "grad_norm": 1.39273870980327, "kl": 0.078125, "learning_rate": 6.94580078125e-07, "loss": 0.0031, "reward": 1.708004117012024, "reward_std": 0.06258507259190083, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7392540872097015, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 303.015625, "epoch": 1.22265625, "grad_norm": 2.5625187589958376, "kl": 0.08349609375, "learning_rate": 6.944580078125e-07, "loss": 0.0033, "reward": 1.8098444938659668, "reward_std": 0.09694074839353561, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8176570236682892, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 396.3984375, "epoch": 1.22314453125, "grad_norm": 2.0931664616747683, "kl": 0.0699462890625, "learning_rate": 6.943359374999999e-07, "loss": 0.0028, "reward": 1.6923267245292664, "reward_std": 0.16218779981136322, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7157641649246216, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 291.609375, "epoch": 1.2236328125, "grad_norm": 1.3978025191646846, "kl": 0.0634765625, "learning_rate": 6.942138671874999e-07, "loss": 0.0025, "reward": 1.6496607065200806, "reward_std": 0.08597181178629398, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6652857065200806, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 323.234375, "epoch": 1.22412109375, "grad_norm": 2.1379914333546512, "kl": 0.06591796875, "learning_rate": 6.94091796875e-07, "loss": 0.0026, "reward": 1.7624231576919556, "reward_std": 0.12387410178780556, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7780481576919556, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 394.0234375, "epoch": 1.224609375, "grad_norm": 1.1228065728537064, "kl": 0.0565185546875, "learning_rate": 6.939697265625e-07, "loss": 0.0023, "reward": 1.6814470887184143, "reward_std": 0.021641411818563938, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6814470887184143, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 377.609375, "epoch": 1.22509765625, "grad_norm": 0.8026004860091579, "kl": 0.0560302734375, "learning_rate": 6.9384765625e-07, "loss": 0.0022, "reward": 1.842549443244934, "reward_std": 0.04153232369571924, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8425493836402893, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 332.3828125, "epoch": 1.2255859375, "grad_norm": 2.430656263181087, "kl": 0.077880859375, "learning_rate": 6.937255859375e-07, "loss": 0.0031, "reward": 1.8032140135765076, "reward_std": 0.0743367203976959, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.81102654337883, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 268.3828125, "epoch": 1.22607421875, "grad_norm": 1.168922825721485, "kl": 0.08544921875, "learning_rate": 6.936035156249999e-07, "loss": 0.0034, "reward": 1.6974853873252869, "reward_std": 0.08852525055408478, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7521729171276093, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 270.8984375, "epoch": 1.2265625, "grad_norm": 8.023737021131817, "kl": 0.08056640625, "learning_rate": 6.934814453124999e-07, "loss": 0.0032, "reward": 1.7808299660682678, "reward_std": 0.04870981816202402, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.780829906463623, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 353.8203125, "epoch": 1.22705078125, "grad_norm": 3.438619709615711, "kl": 0.069580078125, "learning_rate": 6.93359375e-07, "loss": 0.0028, "reward": 1.6688467860221863, "reward_std": 0.12142006307840347, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6766592860221863, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 287.1796875, "epoch": 1.2275390625, "grad_norm": 2.202825505955737, "kl": 0.0712890625, "learning_rate": 6.932373046875e-07, "loss": 0.0028, "reward": 1.6930432319641113, "reward_std": 0.06635242141783237, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7008557617664337, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 242.890625, "epoch": 1.22802734375, "grad_norm": 1.4339804333793391, "kl": 0.078857421875, "learning_rate": 6.93115234375e-07, "loss": 0.0032, "reward": 1.764868676662445, "reward_std": 0.05308605916798115, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7648686468601227, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 296.2734375, "epoch": 1.228515625, "grad_norm": 3.6539966716350767, "kl": 0.076416015625, "learning_rate": 6.929931640625e-07, "loss": 0.0031, "reward": 1.7111204266548157, "reward_std": 0.04776516975834966, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7111203968524933, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 273.9296875, "epoch": 1.22900390625, "grad_norm": 1.3654858706509436, "kl": 0.0810546875, "learning_rate": 6.9287109375e-07, "loss": 0.0032, "reward": 1.8311191201210022, "reward_std": 0.03589681722223759, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8311191201210022, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 345.109375, "epoch": 1.2294921875, "grad_norm": 1.5725270743331548, "kl": 0.07763671875, "learning_rate": 6.927490234374999e-07, "loss": 0.0031, "reward": 1.5551150441169739, "reward_std": 0.0790153406560421, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6019900143146515, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 306.3359375, "epoch": 1.22998046875, "grad_norm": 3.1962807812262266, "kl": 0.0567626953125, "learning_rate": 6.926269531249999e-07, "loss": 0.0023, "reward": 1.6948148608207703, "reward_std": 0.059128282591700554, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.694814920425415, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 283.8828125, "epoch": 1.23046875, "grad_norm": 1.986553003861319, "kl": 0.0697021484375, "learning_rate": 6.925048828125e-07, "loss": 0.0028, "reward": 1.7517194151878357, "reward_std": 0.04597326088696718, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7517193853855133, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 345.4921875, "epoch": 1.23095703125, "grad_norm": 1.7190974829655477, "kl": 0.072509765625, "learning_rate": 6.923828125e-07, "loss": 0.0029, "reward": 1.6288211941719055, "reward_std": 0.025298184249550104, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6288211941719055, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 333.390625, "epoch": 1.2314453125, "grad_norm": 3.1290421341569092, "kl": 0.079345703125, "learning_rate": 6.922607421875e-07, "loss": 0.0032, "reward": 1.8155579566955566, "reward_std": 0.10759843979030848, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.823370486497879, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 281.71875, "epoch": 1.23193359375, "grad_norm": 1.2667127723104277, "kl": 0.071533203125, "learning_rate": 6.92138671875e-07, "loss": 0.0029, "reward": 1.737762212753296, "reward_std": 0.03363693691790104, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7377622127532959, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 249.4765625, "epoch": 1.232421875, "grad_norm": 3.050787985177319, "kl": 0.07470703125, "learning_rate": 6.920166015624999e-07, "loss": 0.003, "reward": 1.8481884598731995, "reward_std": 0.06605091877281666, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8560009598731995, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 318.7421875, "epoch": 1.23291015625, "grad_norm": 2.338599161344737, "kl": 0.065185546875, "learning_rate": 6.918945312499999e-07, "loss": 0.0026, "reward": 1.784920573234558, "reward_std": 0.06366929598152637, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7927330732345581, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 237.25, "epoch": 1.2333984375, "grad_norm": 5.581950906370106, "kl": 0.0888671875, "learning_rate": 6.917724609375e-07, "loss": 0.0036, "reward": 1.7415488362312317, "reward_std": 0.04909018334001303, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7415488362312317, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 306.796875, "epoch": 1.23388671875, "grad_norm": 2.231397610748833, "kl": 0.09130859375, "learning_rate": 6.91650390625e-07, "loss": 0.0037, "reward": 1.8085330724716187, "reward_std": 0.06506985053420067, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8163455426692963, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 217.4375, "epoch": 1.234375, "grad_norm": 1.557598316678241, "kl": 0.0849609375, "learning_rate": 6.915283203125e-07, "loss": 0.0034, "reward": 1.771598756313324, "reward_std": 0.036864256486296654, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.771598756313324, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 226.5859375, "epoch": 1.23486328125, "grad_norm": 8.050764518205462, "kl": 0.108642578125, "learning_rate": 6.9140625e-07, "loss": 0.0044, "reward": 1.6949394345283508, "reward_std": 0.11935023218393326, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6949394047260284, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 275.75, "epoch": 1.2353515625, "grad_norm": 3.577749415144858, "kl": 0.072509765625, "learning_rate": 6.912841796875e-07, "loss": 0.0029, "reward": 1.7945581078529358, "reward_std": 0.026416001841425896, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7945581078529358, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 291.1171875, "epoch": 1.23583984375, "grad_norm": 1.882919193494393, "kl": 0.097900390625, "learning_rate": 6.911621093749999e-07, "loss": 0.0039, "reward": 1.7366108298301697, "reward_std": 0.088971808552742, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7444233596324921, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 312.9921875, "epoch": 1.236328125, "grad_norm": 1.6856414658348984, "kl": 0.072998046875, "learning_rate": 6.910400390624999e-07, "loss": 0.0029, "reward": 1.7254244089126587, "reward_std": 0.08224152028560638, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7332369983196259, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 260.0625, "epoch": 1.23681640625, "grad_norm": 2.340077383886054, "kl": 0.075927734375, "learning_rate": 6.9091796875e-07, "loss": 0.003, "reward": 1.8460680842399597, "reward_std": 0.06705048866569996, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8460681140422821, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 272.0390625, "epoch": 1.2373046875, "grad_norm": 1.9943965668932906, "kl": 0.110595703125, "learning_rate": 6.907958984375e-07, "loss": 0.0044, "reward": 1.775262475013733, "reward_std": 0.06797738745808601, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7752624750137329, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 309.0625, "epoch": 1.23779296875, "grad_norm": 2.2221758125368467, "kl": 0.067626953125, "learning_rate": 6.90673828125e-07, "loss": 0.0027, "reward": 1.8863377571105957, "reward_std": 0.058480268344283104, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8863376975059509, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 359.5546875, "epoch": 1.23828125, "grad_norm": 2.2548699663415714, "kl": 0.0633544921875, "learning_rate": 6.905517578125e-07, "loss": 0.0025, "reward": 1.9195441007614136, "reward_std": 0.04364974796772003, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9195441007614136, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 295.171875, "epoch": 1.23876953125, "grad_norm": 4.280833973797064, "kl": 0.0657958984375, "learning_rate": 6.904296875e-07, "loss": 0.0026, "reward": 1.8286893963813782, "reward_std": 0.1052445936948061, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8365019261837006, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 212.0390625, "epoch": 1.2392578125, "grad_norm": 1.0725990560276126, "kl": 0.093505859375, "learning_rate": 6.903076171874999e-07, "loss": 0.0037, "reward": 1.7462586760520935, "reward_std": 0.01885821617906913, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7462586760520935, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 281.4296875, "epoch": 1.23974609375, "grad_norm": 0.5566830426478924, "kl": 0.069580078125, "learning_rate": 6.90185546875e-07, "loss": 0.0028, "reward": 1.7841619849205017, "reward_std": 0.020260846242308617, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7841619849205017, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 389.453125, "epoch": 1.240234375, "grad_norm": 1.4564201896065856, "kl": 0.055419921875, "learning_rate": 6.900634765625e-07, "loss": 0.0022, "reward": 1.7457255721092224, "reward_std": 0.10040692985057831, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7535381019115448, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 260.90625, "epoch": 1.24072265625, "grad_norm": 0.9193644485156054, "kl": 0.07958984375, "learning_rate": 6.8994140625e-07, "loss": 0.0032, "reward": 1.5869048237800598, "reward_std": 0.060834175907075405, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6025297790765762, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 284.53125, "epoch": 1.2412109375, "grad_norm": 2.0003891138751797, "kl": 0.0830078125, "learning_rate": 6.898193359375e-07, "loss": 0.0033, "reward": 1.7775180339813232, "reward_std": 0.09747044742107391, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7775180339813232, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 277.953125, "epoch": 1.24169921875, "grad_norm": 1.241308895200411, "kl": 0.0694580078125, "learning_rate": 6.89697265625e-07, "loss": 0.0028, "reward": 1.6753292679786682, "reward_std": 0.07925521302968264, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6909542381763458, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 280.4375, "epoch": 1.2421875, "grad_norm": 7.373580663731735, "kl": 0.083251953125, "learning_rate": 6.895751953124999e-07, "loss": 0.0033, "reward": 1.7385675311088562, "reward_std": 0.03642314299941063, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7385675311088562, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 303.8046875, "epoch": 1.24267578125, "grad_norm": 2.6589438244356502, "kl": 0.078125, "learning_rate": 6.894531249999999e-07, "loss": 0.0031, "reward": 1.6425248980522156, "reward_std": 0.05202796123921871, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6425248980522156, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 256.640625, "epoch": 1.2431640625, "grad_norm": 1.7535877983308272, "kl": 0.085205078125, "learning_rate": 6.893310546875e-07, "loss": 0.0034, "reward": 1.7897852659225464, "reward_std": 0.06561807543039322, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7897853255271912, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 267.546875, "epoch": 1.24365234375, "grad_norm": 1.1188283700852508, "kl": 0.061767578125, "learning_rate": 6.89208984375e-07, "loss": 0.0025, "reward": 1.8053843975067139, "reward_std": 0.03231469355523586, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8053844273090363, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 317.90625, "epoch": 1.244140625, "grad_norm": 2.9696598961961973, "kl": 0.0638427734375, "learning_rate": 6.890869140625e-07, "loss": 0.0026, "reward": 1.629117488861084, "reward_std": 0.0660979188978672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6291175484657288, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 229.4375, "epoch": 1.24462890625, "grad_norm": 2.209328842454915, "kl": 0.07373046875, "learning_rate": 6.8896484375e-07, "loss": 0.0029, "reward": 1.7065168619155884, "reward_std": 0.02056479558814317, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7065168619155884, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 236.4765625, "epoch": 1.2451171875, "grad_norm": 2.1506170169340852, "kl": 0.0753173828125, "learning_rate": 6.888427734375e-07, "loss": 0.003, "reward": 1.7226258516311646, "reward_std": 0.06090010888874531, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7226258814334869, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 340.828125, "epoch": 1.24560546875, "grad_norm": 1.6867670137775563, "kl": 0.0570068359375, "learning_rate": 6.887207031249999e-07, "loss": 0.0023, "reward": 1.907860517501831, "reward_std": 0.13618198037147522, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9156731367111206, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 274.4296875, "epoch": 1.24609375, "grad_norm": 2.014025847589191, "kl": 0.0748291015625, "learning_rate": 6.885986328124999e-07, "loss": 0.003, "reward": 1.8035182356834412, "reward_std": 0.10311203170567751, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8035181760787964, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 359.8671875, "epoch": 1.24658203125, "grad_norm": 2.460777748950473, "kl": 0.065185546875, "learning_rate": 6.884765625e-07, "loss": 0.0026, "reward": 1.7247642874717712, "reward_std": 0.0778743838891387, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7325767874717712, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 290.859375, "epoch": 1.2470703125, "grad_norm": 2.066296664154434, "kl": 0.0635986328125, "learning_rate": 6.883544921875e-07, "loss": 0.0025, "reward": 1.8594765067100525, "reward_std": 0.03086453676223755, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8594764471054077, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 288.734375, "epoch": 1.24755859375, "grad_norm": 1.129593989044106, "kl": 0.072265625, "learning_rate": 6.88232421875e-07, "loss": 0.0029, "reward": 1.8053827285766602, "reward_std": 0.08634701371192932, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8053827881813049, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 297.9609375, "epoch": 1.248046875, "grad_norm": 2.1481848196997033, "kl": 0.074462890625, "learning_rate": 6.881103515625e-07, "loss": 0.003, "reward": 1.8480090498924255, "reward_std": 0.052580492570996284, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8480090796947479, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 297.8359375, "epoch": 1.24853515625, "grad_norm": 1.3215526698458806, "kl": 0.072265625, "learning_rate": 6.879882812499999e-07, "loss": 0.0029, "reward": 1.7874248027801514, "reward_std": 0.04134686943143606, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7874249219894409, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 326.796875, "epoch": 1.2490234375, "grad_norm": 7.161134007570032, "kl": 0.076904296875, "learning_rate": 6.878662109374999e-07, "loss": 0.0031, "reward": 1.7982208728790283, "reward_std": 0.05789235234260559, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7982209324836731, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 352.59375, "epoch": 1.24951171875, "grad_norm": 1.5826957329843547, "kl": 0.086181640625, "learning_rate": 6.87744140625e-07, "loss": 0.0034, "reward": 1.724461555480957, "reward_std": 0.05809208191931248, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7244615852832794, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 427.8984375, "epoch": 1.25, "grad_norm": 0.6970013528941209, "kl": 0.0509033203125, "learning_rate": 6.876220703125e-07, "loss": 0.002, "reward": 1.8204131126403809, "reward_std": 0.08120441623032093, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8282255232334137, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 287.75, "epoch": 1.25048828125, "grad_norm": 0.9949031525763067, "kl": 0.080810546875, "learning_rate": 6.875e-07, "loss": 0.0032, "reward": 1.85645192861557, "reward_std": 0.05746803432703018, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8564519584178925, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 338.1640625, "epoch": 1.2509765625, "grad_norm": 1.5878015011305773, "kl": 0.0599365234375, "learning_rate": 6.873779296875e-07, "loss": 0.0024, "reward": 1.7821994423866272, "reward_std": 0.09675415605306625, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.790012001991272, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 302.453125, "epoch": 1.25146484375, "grad_norm": 2.5951982218621885, "kl": 0.092041015625, "learning_rate": 6.87255859375e-07, "loss": 0.0037, "reward": 1.6413246393203735, "reward_std": 0.07367514073848724, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6413246095180511, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 299.953125, "epoch": 1.251953125, "grad_norm": 2.1313560894780688, "kl": 0.06494140625, "learning_rate": 6.871337890624999e-07, "loss": 0.0026, "reward": 1.9887361526489258, "reward_std": 0.059286823496222496, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9887359738349915, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 264.2890625, "epoch": 1.25244140625, "grad_norm": 0.9087564610351305, "kl": 0.071044921875, "learning_rate": 6.870117187499999e-07, "loss": 0.0028, "reward": 1.880197525024414, "reward_std": 0.05455988273024559, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8801974654197693, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 241.7890625, "epoch": 1.2529296875, "grad_norm": 2.3160382343134533, "kl": 0.097412109375, "learning_rate": 6.868896484375e-07, "loss": 0.0039, "reward": 1.6566903591156006, "reward_std": 0.09503332898020744, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6566903293132782, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 253.0, "epoch": 1.25341796875, "grad_norm": 2.2293864959901772, "kl": 0.06982421875, "learning_rate": 6.86767578125e-07, "loss": 0.0028, "reward": 1.8191727995872498, "reward_std": 0.07018731534481049, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8191727995872498, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 237.0078125, "epoch": 1.25390625, "grad_norm": 3.352890296035172, "kl": 0.076904296875, "learning_rate": 6.866455078125e-07, "loss": 0.0031, "reward": 1.7632625102996826, "reward_std": 0.052378684282302856, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7632625102996826, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 225.3125, "epoch": 1.25439453125, "grad_norm": 3.1814122296646916, "kl": 0.0689697265625, "learning_rate": 6.865234375e-07, "loss": 0.0028, "reward": 1.743474543094635, "reward_std": 0.059906596317887306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7434745132923126, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 277.859375, "epoch": 1.2548828125, "grad_norm": 1.2044355795614614, "kl": 0.0648193359375, "learning_rate": 6.864013671874999e-07, "loss": 0.0026, "reward": 1.7757219076156616, "reward_std": 0.05339077487587929, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7757218182086945, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 318.734375, "epoch": 1.25537109375, "grad_norm": 1.50396583424587, "kl": 0.0584716796875, "learning_rate": 6.862792968749999e-07, "loss": 0.0023, "reward": 1.8511592745780945, "reward_std": 0.06993940658867359, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8511592745780945, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 217.3203125, "epoch": 1.255859375, "grad_norm": 7.325974862571428, "kl": 0.080078125, "learning_rate": 6.861572265625e-07, "loss": 0.0032, "reward": 1.8245378136634827, "reward_std": 0.05453048273921013, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8245378732681274, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 289.875, "epoch": 1.25634765625, "grad_norm": 2.5748832751879034, "kl": 0.073486328125, "learning_rate": 6.8603515625e-07, "loss": 0.0029, "reward": 1.7536067962646484, "reward_std": 0.07038977555930614, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7536068260669708, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 332.296875, "epoch": 1.2568359375, "grad_norm": 2.2116366377933505, "kl": 0.05859375, "learning_rate": 6.859130859375e-07, "loss": 0.0023, "reward": 1.7995004653930664, "reward_std": 0.09891559928655624, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8073129653930664, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 288.78125, "epoch": 1.25732421875, "grad_norm": 1.123092523187175, "kl": 0.06787109375, "learning_rate": 6.85791015625e-07, "loss": 0.0027, "reward": 1.8330579996109009, "reward_std": 0.045371233485639095, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8330580592155457, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 286.2734375, "epoch": 1.2578125, "grad_norm": 2.1695911445553513, "kl": 0.06787109375, "learning_rate": 6.856689453125e-07, "loss": 0.0027, "reward": 1.8154324889183044, "reward_std": 0.04039592668414116, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8154324889183044, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 337.015625, "epoch": 1.25830078125, "grad_norm": 1.1977765178723714, "kl": 0.0513916015625, "learning_rate": 6.855468749999999e-07, "loss": 0.0021, "reward": 1.7662554383277893, "reward_std": 0.008739282377064228, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7662554383277893, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 320.625, "epoch": 1.2587890625, "grad_norm": 2.582999038354569, "kl": 0.05078125, "learning_rate": 6.854248046874999e-07, "loss": 0.002, "reward": 1.8732368350028992, "reward_std": 0.13687162101268768, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8810493648052216, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 346.1875, "epoch": 1.25927734375, "grad_norm": 3.2603077110393186, "kl": 0.07080078125, "learning_rate": 6.85302734375e-07, "loss": 0.0028, "reward": 1.744448721408844, "reward_std": 0.07884471863508224, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7444487512111664, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 268.2734375, "epoch": 1.259765625, "grad_norm": 1.5072963778520792, "kl": 0.072265625, "learning_rate": 6.851806640625e-07, "loss": 0.0029, "reward": 1.8110605478286743, "reward_std": 0.0798899196088314, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8110604882240295, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 254.109375, "epoch": 1.26025390625, "grad_norm": 6.052899624466231, "kl": 0.1295166015625, "learning_rate": 6.8505859375e-07, "loss": 0.0052, "reward": 1.855854094028473, "reward_std": 0.03883726242929697, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8558541238307953, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 229.171875, "epoch": 1.2607421875, "grad_norm": 2.2670697938409052, "kl": 0.0751953125, "learning_rate": 6.849365234375e-07, "loss": 0.003, "reward": 1.5954683423042297, "reward_std": 0.01611372921615839, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5954683721065521, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 307.5234375, "epoch": 1.26123046875, "grad_norm": 9.369240905527494, "kl": 0.06787109375, "learning_rate": 6.848144531249999e-07, "loss": 0.0027, "reward": 1.7278264164924622, "reward_std": 0.05693458020687103, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7278264462947845, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 267.953125, "epoch": 1.26171875, "grad_norm": 1.2283989647554525, "kl": 0.0574951171875, "learning_rate": 6.846923828124999e-07, "loss": 0.0023, "reward": 1.7502474188804626, "reward_std": 0.026749521493911743, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.750247448682785, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 298.6875, "epoch": 1.26220703125, "grad_norm": 1.2610728333774404, "kl": 0.06201171875, "learning_rate": 6.845703125e-07, "loss": 0.0025, "reward": 1.7522244453430176, "reward_std": 0.09181947819888592, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7522244453430176, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 307.3828125, "epoch": 1.2626953125, "grad_norm": 2.414972278703281, "kl": 0.06396484375, "learning_rate": 6.844482421875e-07, "loss": 0.0026, "reward": 1.737687885761261, "reward_std": 0.07903081178665161, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.737687885761261, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 272.75, "epoch": 1.26318359375, "grad_norm": 1.7270948683315666, "kl": 0.04931640625, "learning_rate": 6.84326171875e-07, "loss": 0.002, "reward": 1.781536877155304, "reward_std": 0.0434822803363204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7815368473529816, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 356.40625, "epoch": 1.263671875, "grad_norm": 1.3949112846676868, "kl": 0.0535888671875, "learning_rate": 6.842041015625e-07, "loss": 0.0021, "reward": 1.8454834818840027, "reward_std": 0.05372583121061325, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8454834818840027, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 261.84375, "epoch": 1.26416015625, "grad_norm": 1.4368123149876182, "kl": 0.0692138671875, "learning_rate": 6.8408203125e-07, "loss": 0.0028, "reward": 1.6451177597045898, "reward_std": 0.07861702609807253, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6763677000999451, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 269.125, "epoch": 1.2646484375, "grad_norm": 2.8027515559043823, "kl": 0.068603515625, "learning_rate": 6.839599609374999e-07, "loss": 0.0027, "reward": 1.863362193107605, "reward_std": 0.08033762127161026, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.871174693107605, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 303.65625, "epoch": 1.26513671875, "grad_norm": 1.3661088668145678, "kl": 0.071533203125, "learning_rate": 6.838378906249999e-07, "loss": 0.0029, "reward": 1.7393322587013245, "reward_std": 0.052713219076395035, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7393322587013245, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 285.53125, "epoch": 1.265625, "grad_norm": 1.1496627965247463, "kl": 0.05908203125, "learning_rate": 6.837158203125e-07, "loss": 0.0024, "reward": 1.831416666507721, "reward_std": 0.06289010029286146, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8314166367053986, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 313.3359375, "epoch": 1.26611328125, "grad_norm": 1.1102501988463187, "kl": 0.0625, "learning_rate": 6.8359375e-07, "loss": 0.0025, "reward": 1.9127737879753113, "reward_std": 0.09909685142338276, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.9362112879753113, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 266.5703125, "epoch": 1.2666015625, "grad_norm": 2.558242186997854, "kl": 0.07080078125, "learning_rate": 6.834716796875e-07, "loss": 0.0028, "reward": 1.7129462957382202, "reward_std": 0.2064364030957222, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.752008855342865, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 266.2421875, "epoch": 1.26708984375, "grad_norm": 1.4121290555246413, "kl": 0.06591796875, "learning_rate": 6.83349609375e-07, "loss": 0.0026, "reward": 1.721196711063385, "reward_std": 0.045722841285169125, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7211967408657074, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 275.6640625, "epoch": 1.267578125, "grad_norm": 8.542422233364945, "kl": 0.0450439453125, "learning_rate": 6.832275390624999e-07, "loss": 0.0018, "reward": 1.845105767250061, "reward_std": 0.04223616607487202, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8451057970523834, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 335.65625, "epoch": 1.26806640625, "grad_norm": 2.172156191986647, "kl": 0.05712890625, "learning_rate": 6.831054687499999e-07, "loss": 0.0023, "reward": 1.7219505906105042, "reward_std": 0.05229894071817398, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7219505310058594, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 312.2109375, "epoch": 1.2685546875, "grad_norm": 4.512629289400748, "kl": 0.07275390625, "learning_rate": 6.829833984375e-07, "loss": 0.0029, "reward": 1.775171935558319, "reward_std": 0.11626030504703522, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7751719057559967, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 236.8828125, "epoch": 1.26904296875, "grad_norm": 19.436272087132117, "kl": 0.0650634765625, "learning_rate": 6.82861328125e-07, "loss": 0.0026, "reward": 1.8984931111335754, "reward_std": 0.047796593979001045, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8984931111335754, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 203.1796875, "epoch": 1.26953125, "grad_norm": 0.8005281955093715, "kl": 0.0565185546875, "learning_rate": 6.827392578125e-07, "loss": 0.0023, "reward": 1.9395660758018494, "reward_std": 0.015174323692917824, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9395660758018494, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 294.0546875, "epoch": 1.27001953125, "grad_norm": 1.7873236969539745, "kl": 0.0601806640625, "learning_rate": 6.826171875e-07, "loss": 0.0024, "reward": 1.729775309562683, "reward_std": 0.11483496427536011, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7610252797603607, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 312.34375, "epoch": 1.2705078125, "grad_norm": 1.5860494003883214, "kl": 0.0635986328125, "learning_rate": 6.824951171875e-07, "loss": 0.0025, "reward": 1.619213342666626, "reward_std": 0.13867055252194405, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6504633724689484, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 234.5625, "epoch": 1.27099609375, "grad_norm": 1.8263568249389657, "kl": 0.0648193359375, "learning_rate": 6.823730468749999e-07, "loss": 0.0026, "reward": 1.7087448835372925, "reward_std": 0.059230593498796225, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7165573537349701, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 294.953125, "epoch": 1.271484375, "grad_norm": 2.021543206871508, "kl": 0.081298828125, "learning_rate": 6.822509765624999e-07, "loss": 0.0033, "reward": 1.6043951511383057, "reward_std": 0.11108111217617989, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6278325915336609, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 278.984375, "epoch": 1.27197265625, "grad_norm": 3.424869485308043, "kl": 0.072998046875, "learning_rate": 6.8212890625e-07, "loss": 0.0029, "reward": 1.9290322065353394, "reward_std": 0.13008323311805725, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.936844676733017, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 275.2734375, "epoch": 1.2724609375, "grad_norm": 0.8833183790651663, "kl": 0.0531005859375, "learning_rate": 6.820068359375e-07, "loss": 0.0021, "reward": 1.8816287517547607, "reward_std": 0.05253131175413728, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8894413113594055, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 407.9609375, "epoch": 1.27294921875, "grad_norm": 1.57460282033083, "kl": 0.0523681640625, "learning_rate": 6.81884765625e-07, "loss": 0.0021, "reward": 1.7124788165092468, "reward_std": 0.07889316231012344, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7202913463115692, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 215.6875, "epoch": 1.2734375, "grad_norm": 2.311076319577126, "kl": 0.060546875, "learning_rate": 6.817626953125e-07, "loss": 0.0024, "reward": 1.8965556025505066, "reward_std": 0.060334792360663414, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8965555429458618, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 388.203125, "epoch": 1.27392578125, "grad_norm": 2.312918341316606, "kl": 0.0556640625, "learning_rate": 6.816406249999999e-07, "loss": 0.0022, "reward": 1.6706057786941528, "reward_std": 0.15810733288526535, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.7331057786941528, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 275.6328125, "epoch": 1.2744140625, "grad_norm": 8.087988058753458, "kl": 0.05908203125, "learning_rate": 6.815185546874999e-07, "loss": 0.0024, "reward": 1.8144915699958801, "reward_std": 0.04744442366063595, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8144915997982025, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 326.875, "epoch": 1.27490234375, "grad_norm": 0.5706401219102212, "kl": 0.0506591796875, "learning_rate": 6.81396484375e-07, "loss": 0.002, "reward": 1.7547515630722046, "reward_std": 0.018506707157939672, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7547515630722046, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 277.4609375, "epoch": 1.275390625, "grad_norm": 12.49022768844792, "kl": 0.059326171875, "learning_rate": 6.812744140625e-07, "loss": 0.0024, "reward": 1.794619619846344, "reward_std": 0.06346526741981506, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.794619619846344, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 220.828125, "epoch": 1.27587890625, "grad_norm": 1.0850062950816326, "kl": 0.07958984375, "learning_rate": 6.8115234375e-07, "loss": 0.0032, "reward": 1.7299774289131165, "reward_std": 0.03509983792901039, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7299774289131165, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 226.71875, "epoch": 1.2763671875, "grad_norm": 1.080187026255585, "kl": 0.0640869140625, "learning_rate": 6.810302734375e-07, "loss": 0.0026, "reward": 1.8587305545806885, "reward_std": 0.07435241714119911, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8587304651737213, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 253.125, "epoch": 1.27685546875, "grad_norm": 1.1601920921582176, "kl": 0.060546875, "learning_rate": 6.80908203125e-07, "loss": 0.0024, "reward": 1.8313266038894653, "reward_std": 0.018857479095458984, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8313265740871429, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 293.34375, "epoch": 1.27734375, "grad_norm": 2.85155532714431, "kl": 0.0675048828125, "learning_rate": 6.807861328124999e-07, "loss": 0.0027, "reward": 1.8055160641670227, "reward_std": 0.046973712742328644, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8055160045623779, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 282.84375, "epoch": 1.27783203125, "grad_norm": 4.513029644722789, "kl": 0.0791015625, "learning_rate": 6.806640624999999e-07, "loss": 0.0032, "reward": 1.7537464499473572, "reward_std": 0.11804336681962013, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7615589499473572, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 309.4609375, "epoch": 1.2783203125, "grad_norm": 3.8194841411583798, "kl": 0.053466796875, "learning_rate": 6.805419921875e-07, "loss": 0.0021, "reward": 1.677466869354248, "reward_std": 0.056195804849267006, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6774668991565704, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 286.2265625, "epoch": 1.27880859375, "grad_norm": 6.474962292439623, "kl": 0.0562744140625, "learning_rate": 6.80419921875e-07, "loss": 0.0023, "reward": 1.8547720909118652, "reward_std": 0.025560058653354645, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8547720909118652, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 310.46875, "epoch": 1.279296875, "grad_norm": 1.4597995925565477, "kl": 0.0693359375, "learning_rate": 6.802978515625e-07, "loss": 0.0028, "reward": 1.6823553442955017, "reward_std": 0.1308056991547346, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7136052846908569, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 261.5078125, "epoch": 1.27978515625, "grad_norm": 0.6324777562183502, "kl": 0.052978515625, "learning_rate": 6.8017578125e-07, "loss": 0.0021, "reward": 1.6706210374832153, "reward_std": 0.05375996232032776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6706210076808929, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 297.0078125, "epoch": 1.2802734375, "grad_norm": 1.2610636513041573, "kl": 0.0584716796875, "learning_rate": 6.800537109374999e-07, "loss": 0.0023, "reward": 1.7311273217201233, "reward_std": 0.192430280148983, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7545647621154785, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 193.671875, "epoch": 1.28076171875, "grad_norm": 1.678965103507942, "kl": 0.068603515625, "learning_rate": 6.799316406249999e-07, "loss": 0.0027, "reward": 1.7555344700813293, "reward_std": 0.048562128096818924, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7555344700813293, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 269.1875, "epoch": 1.28125, "grad_norm": 1.4249689682498725, "kl": 0.0643310546875, "learning_rate": 6.798095703125e-07, "loss": 0.0026, "reward": 1.7415629029273987, "reward_std": 0.07066285982728004, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7415629029273987, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 287.1640625, "epoch": 1.28173828125, "grad_norm": 1.3604919034116547, "kl": 0.0662841796875, "learning_rate": 6.796875e-07, "loss": 0.0027, "reward": 1.841383457183838, "reward_std": 0.037516459822654724, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8413834273815155, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 274.109375, "epoch": 1.2822265625, "grad_norm": 1.0918554991669291, "kl": 0.0423583984375, "learning_rate": 6.795654296875e-07, "loss": 0.0017, "reward": 1.810799479484558, "reward_std": 0.05421273224055767, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8107994794845581, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 241.109375, "epoch": 1.28271484375, "grad_norm": 0.9904225454583597, "kl": 0.06640625, "learning_rate": 6.79443359375e-07, "loss": 0.0027, "reward": 1.7526730298995972, "reward_std": 0.03807441703975201, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7526730000972748, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 342.765625, "epoch": 1.283203125, "grad_norm": 1.7982981484887894, "kl": 0.104248046875, "learning_rate": 6.793212890625e-07, "loss": 0.0042, "reward": 1.5381957292556763, "reward_std": 0.23323528468608856, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.5850707292556763, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 327.453125, "epoch": 1.28369140625, "grad_norm": 1.550548686839029, "kl": 0.085693359375, "learning_rate": 6.791992187499999e-07, "loss": 0.0034, "reward": 1.7918881177902222, "reward_std": 0.10955053754150867, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7997006177902222, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 297.5625, "epoch": 1.2841796875, "grad_norm": 1.6403304793757816, "kl": 0.0595703125, "learning_rate": 6.790771484374999e-07, "loss": 0.0024, "reward": 1.7768383026123047, "reward_std": 0.0630792174488306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7768383026123047, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 299.09375, "epoch": 1.28466796875, "grad_norm": 2.733466306796843, "kl": 0.063232421875, "learning_rate": 6.78955078125e-07, "loss": 0.0025, "reward": 1.7490355968475342, "reward_std": 0.09908445179462433, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7568481266498566, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 295.4609375, "epoch": 1.28515625, "grad_norm": 0.5505330642135668, "kl": 0.05712890625, "learning_rate": 6.788330078125e-07, "loss": 0.0023, "reward": 1.785165786743164, "reward_std": 0.05979756236774847, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8007907271385193, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 249.1640625, "epoch": 1.28564453125, "grad_norm": 2.009225132396737, "kl": 0.075927734375, "learning_rate": 6.787109375e-07, "loss": 0.003, "reward": 1.7671828269958496, "reward_std": 0.06479834392666817, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7671828269958496, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 270.125, "epoch": 1.2861328125, "grad_norm": 1.7136252258972942, "kl": 0.0494384765625, "learning_rate": 6.785888671875e-07, "loss": 0.002, "reward": 1.8264079093933105, "reward_std": 0.017682873643934727, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8264078795909882, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 268.1640625, "epoch": 1.28662109375, "grad_norm": 3.778508106643716, "kl": 0.0667724609375, "learning_rate": 6.78466796875e-07, "loss": 0.0027, "reward": 1.8192695379257202, "reward_std": 0.055723583325743675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.819269597530365, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 276.5625, "epoch": 1.287109375, "grad_norm": 2.0871013161582908, "kl": 0.068603515625, "learning_rate": 6.783447265624999e-07, "loss": 0.0027, "reward": 1.8692251443862915, "reward_std": 0.03455257322639227, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8770376443862915, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 341.796875, "epoch": 1.28759765625, "grad_norm": 1.9442360354470494, "kl": 0.0665283203125, "learning_rate": 6.7822265625e-07, "loss": 0.0027, "reward": 1.8144180178642273, "reward_std": 0.060984525829553604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8144180476665497, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 318.0390625, "epoch": 1.2880859375, "grad_norm": 1.0365066351162422, "kl": 0.0621337890625, "learning_rate": 6.781005859375e-07, "loss": 0.0025, "reward": 1.7693456411361694, "reward_std": 0.0630057118833065, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7693456411361694, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 322.7578125, "epoch": 1.28857421875, "grad_norm": 2.8683486825278046, "kl": 0.056396484375, "learning_rate": 6.77978515625e-07, "loss": 0.0023, "reward": 1.7881279587745667, "reward_std": 0.058837566524744034, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7881280183792114, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 399.2734375, "epoch": 1.2890625, "grad_norm": 6.264612314146436, "kl": 0.060302734375, "learning_rate": 6.778564453125e-07, "loss": 0.0024, "reward": 1.6483284831047058, "reward_std": 0.13759692385792732, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.663953423500061, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 299.234375, "epoch": 1.28955078125, "grad_norm": 1.5759987187071773, "kl": 0.05615234375, "learning_rate": 6.77734375e-07, "loss": 0.0022, "reward": 1.8786953687667847, "reward_std": 0.024645724333822727, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8786953389644623, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 299.6328125, "epoch": 1.2900390625, "grad_norm": 2.6015249930837347, "kl": 0.088134765625, "learning_rate": 6.776123046874999e-07, "loss": 0.0035, "reward": 1.656754493713379, "reward_std": 0.07588385604321957, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6567544937133789, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 249.875, "epoch": 1.29052734375, "grad_norm": 1.3132376713397804, "kl": 0.0791015625, "learning_rate": 6.774902343749999e-07, "loss": 0.0032, "reward": 1.7957526445388794, "reward_std": 0.04471752420067787, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.795752614736557, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 332.828125, "epoch": 1.291015625, "grad_norm": 3.2946009869851753, "kl": 0.0601806640625, "learning_rate": 6.773681640625e-07, "loss": 0.0024, "reward": 1.650659203529358, "reward_std": 0.1262562870979309, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6897217035293579, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 305.078125, "epoch": 1.29150390625, "grad_norm": 3.1092957108787003, "kl": 0.0665283203125, "learning_rate": 6.7724609375e-07, "loss": 0.0027, "reward": 1.7754952907562256, "reward_std": 0.11444034799933434, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7989327907562256, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 277.40625, "epoch": 1.2919921875, "grad_norm": 12.661664724175285, "kl": 0.077392578125, "learning_rate": 6.771240234375e-07, "loss": 0.0031, "reward": 1.6563068628311157, "reward_std": 0.10409623384475708, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6563068330287933, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 381.8671875, "epoch": 1.29248046875, "grad_norm": 0.6055988273645182, "kl": 0.05224609375, "learning_rate": 6.77001953125e-07, "loss": 0.0021, "reward": 1.7593251466751099, "reward_std": 0.026515904814004898, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7593251466751099, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 368.5625, "epoch": 1.29296875, "grad_norm": 2.019699875883687, "kl": 0.06591796875, "learning_rate": 6.768798828125e-07, "loss": 0.0026, "reward": 1.6537410616874695, "reward_std": 0.19559639692306519, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.7162410914897919, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 278.78125, "epoch": 1.29345703125, "grad_norm": 3.7763404898754587, "kl": 0.068603515625, "learning_rate": 6.767578124999999e-07, "loss": 0.0027, "reward": 1.827435851097107, "reward_std": 0.044061899185180664, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8274357616901398, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 330.453125, "epoch": 1.2939453125, "grad_norm": 2.8015440153661784, "kl": 0.065185546875, "learning_rate": 6.766357421874999e-07, "loss": 0.0026, "reward": 1.6706737279891968, "reward_std": 0.18939045071601868, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7175487279891968, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 295.1484375, "epoch": 1.29443359375, "grad_norm": 1.9989463705585409, "kl": 0.078125, "learning_rate": 6.76513671875e-07, "loss": 0.0031, "reward": 1.7752625942230225, "reward_std": 0.06523648090660572, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7752625942230225, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 298.7578125, "epoch": 1.294921875, "grad_norm": 5.2627762529444055, "kl": 0.0523681640625, "learning_rate": 6.763916015625e-07, "loss": 0.0021, "reward": 1.7596194744110107, "reward_std": 0.05819558724761009, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7596194744110107, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 385.1875, "epoch": 1.29541015625, "grad_norm": 1.193829150298933, "kl": 0.054443359375, "learning_rate": 6.7626953125e-07, "loss": 0.0022, "reward": 1.7631536722183228, "reward_std": 0.11396730691194534, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8022161424160004, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 410.75, "epoch": 1.2958984375, "grad_norm": 4.511945830839449, "kl": 0.0693359375, "learning_rate": 6.761474609375e-07, "loss": 0.0028, "reward": 1.6372665762901306, "reward_std": 0.12859837338328362, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6763290762901306, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 297.015625, "epoch": 1.29638671875, "grad_norm": 7.337066580487183, "kl": 0.088134765625, "learning_rate": 6.760253906249999e-07, "loss": 0.0035, "reward": 1.770021915435791, "reward_std": 0.0901176705956459, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.770021915435791, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 334.65625, "epoch": 1.296875, "grad_norm": 1.7138139340851553, "kl": 0.054443359375, "learning_rate": 6.759033203124999e-07, "loss": 0.0022, "reward": 1.6479786038398743, "reward_std": 0.055697097443044186, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7026660740375519, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 350.328125, "epoch": 1.29736328125, "grad_norm": 1.157243381436146, "kl": 0.067138671875, "learning_rate": 6.7578125e-07, "loss": 0.0027, "reward": 1.7617986798286438, "reward_std": 0.12362072244286537, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8008612096309662, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 302.4140625, "epoch": 1.2978515625, "grad_norm": 12.730576035610571, "kl": 0.070556640625, "learning_rate": 6.756591796875e-07, "loss": 0.0028, "reward": 1.6338982582092285, "reward_std": 0.0442785257473588, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6338982731103897, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 270.1015625, "epoch": 1.29833984375, "grad_norm": 1.0874774416423236, "kl": 0.06689453125, "learning_rate": 6.75537109375e-07, "loss": 0.0027, "reward": 1.7247290015220642, "reward_std": 0.06158460769802332, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7247289717197418, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 276.2109375, "epoch": 1.298828125, "grad_norm": 1.0818257630262431, "kl": 0.0645751953125, "learning_rate": 6.754150390625e-07, "loss": 0.0026, "reward": 1.8425450325012207, "reward_std": 0.10945501737296581, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8503575325012207, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 284.0703125, "epoch": 1.29931640625, "grad_norm": 14.407404536821112, "kl": 0.0506591796875, "learning_rate": 6.7529296875e-07, "loss": 0.002, "reward": 1.8501919507980347, "reward_std": 0.09266996011137962, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8580044806003571, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 292.46875, "epoch": 1.2998046875, "grad_norm": 0.8514788037499841, "kl": 0.052001953125, "learning_rate": 6.751708984374999e-07, "loss": 0.0021, "reward": 1.7188128232955933, "reward_std": 0.07580004632472992, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7266253232955933, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 351.25, "epoch": 1.30029296875, "grad_norm": 0.7970795376599247, "kl": 0.066650390625, "learning_rate": 6.750488281249999e-07, "loss": 0.0027, "reward": 1.7323698997497559, "reward_std": 0.11393286287784576, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7870573997497559, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 337.4921875, "epoch": 1.30078125, "grad_norm": 2.1897551324280338, "kl": 0.07568359375, "learning_rate": 6.749267578125e-07, "loss": 0.003, "reward": 1.8059039115905762, "reward_std": 0.060338267125189304, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8059038519859314, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 271.0390625, "epoch": 1.30126953125, "grad_norm": 2.9581165989544824, "kl": 0.064453125, "learning_rate": 6.748046875e-07, "loss": 0.0026, "reward": 1.7930954098701477, "reward_std": 0.061221227049827576, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7930954694747925, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 319.15625, "epoch": 1.3017578125, "grad_norm": 1.6499047047869468, "kl": 0.0540771484375, "learning_rate": 6.746826171875e-07, "loss": 0.0022, "reward": 1.8659427165985107, "reward_std": 0.06026996113359928, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8659427464008331, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 208.0625, "epoch": 1.30224609375, "grad_norm": 1.0955739275158136, "kl": 0.07861328125, "learning_rate": 6.74560546875e-07, "loss": 0.0031, "reward": 1.6215183734893799, "reward_std": 0.05073964595794678, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6215183138847351, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 397.03125, "epoch": 1.302734375, "grad_norm": 1.8339905299535653, "kl": 0.0643310546875, "learning_rate": 6.744384765624999e-07, "loss": 0.0026, "reward": 1.716743528842926, "reward_std": 0.10541088692843914, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.740181028842926, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 253.1640625, "epoch": 1.30322265625, "grad_norm": 1.3647709905403491, "kl": 0.086181640625, "learning_rate": 6.743164062499999e-07, "loss": 0.0034, "reward": 1.7058016657829285, "reward_std": 0.029076790437102318, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7058016955852509, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 406.59375, "epoch": 1.3037109375, "grad_norm": 1.3669269567191786, "kl": 0.044189453125, "learning_rate": 6.741943359375e-07, "loss": 0.0018, "reward": 1.672289490699768, "reward_std": 0.13343248516321182, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7113519906997681, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 327.0390625, "epoch": 1.30419921875, "grad_norm": 0.6783532224856341, "kl": 0.053955078125, "learning_rate": 6.74072265625e-07, "loss": 0.0022, "reward": 1.7751038670539856, "reward_std": 0.0778466984629631, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7985413074493408, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 264.03125, "epoch": 1.3046875, "grad_norm": 4.320924526476853, "kl": 0.072021484375, "learning_rate": 6.739501953125e-07, "loss": 0.0029, "reward": 1.6213982105255127, "reward_std": 0.0660770833492279, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6213981807231903, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 318.609375, "epoch": 1.30517578125, "grad_norm": 1.4258907778375465, "kl": 0.061279296875, "learning_rate": 6.73828125e-07, "loss": 0.0024, "reward": 1.8601597547531128, "reward_std": 0.11782187595963478, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8679722547531128, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 267.890625, "epoch": 1.3056640625, "grad_norm": 1.724437541506563, "kl": 0.06787109375, "learning_rate": 6.737060546875e-07, "loss": 0.0027, "reward": 1.786941945552826, "reward_std": 0.0738075040280819, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7869419753551483, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 391.8671875, "epoch": 1.30615234375, "grad_norm": 1.0501594228457007, "kl": 0.062255859375, "learning_rate": 6.735839843749999e-07, "loss": 0.0025, "reward": 1.762403666973114, "reward_std": 0.05871861148625612, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7702161371707916, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 327.015625, "epoch": 1.306640625, "grad_norm": 2.8131149989049526, "kl": 0.069580078125, "learning_rate": 6.734619140624999e-07, "loss": 0.0028, "reward": 1.651845395565033, "reward_std": 0.13027137517929077, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.667470395565033, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 337.078125, "epoch": 1.30712890625, "grad_norm": 3.330367088072933, "kl": 0.086181640625, "learning_rate": 6.7333984375e-07, "loss": 0.0034, "reward": 1.806040346622467, "reward_std": 0.043943583965301514, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8060402572154999, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 297.9609375, "epoch": 1.3076171875, "grad_norm": 10.765893801480635, "kl": 0.063720703125, "learning_rate": 6.732177734375e-07, "loss": 0.0026, "reward": 1.7391871809959412, "reward_std": 0.05080571398139, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7391871213912964, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 262.3203125, "epoch": 1.30810546875, "grad_norm": 1.4238084568541345, "kl": 0.0606689453125, "learning_rate": 6.73095703125e-07, "loss": 0.0024, "reward": 1.8248883485794067, "reward_std": 0.037258436903357506, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8248883485794067, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 276.3046875, "epoch": 1.30859375, "grad_norm": 1.4907117084185106, "kl": 0.0635986328125, "learning_rate": 6.729736328125e-07, "loss": 0.0025, "reward": 1.7909139394760132, "reward_std": 0.06092044711112976, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7909139692783356, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 277.5703125, "epoch": 1.30908203125, "grad_norm": 2.0791465208702387, "kl": 0.0679931640625, "learning_rate": 6.728515624999999e-07, "loss": 0.0027, "reward": 1.7821356058120728, "reward_std": 0.06090878788381815, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7821356058120728, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 251.0859375, "epoch": 1.3095703125, "grad_norm": 1.4915135016440937, "kl": 0.0634765625, "learning_rate": 6.727294921874999e-07, "loss": 0.0025, "reward": 1.8791195154190063, "reward_std": 0.02637836430221796, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8791195452213287, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 273.921875, "epoch": 1.31005859375, "grad_norm": 2.3070951051177553, "kl": 0.076171875, "learning_rate": 6.72607421875e-07, "loss": 0.003, "reward": 1.757739543914795, "reward_std": 0.06817848235368729, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7577394843101501, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 255.5234375, "epoch": 1.310546875, "grad_norm": 1.4622842311116624, "kl": 0.0673828125, "learning_rate": 6.724853515625e-07, "loss": 0.0027, "reward": 1.7414976358413696, "reward_std": 0.022878904826939106, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7414976358413696, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 279.828125, "epoch": 1.31103515625, "grad_norm": 1.450865157370279, "kl": 0.058349609375, "learning_rate": 6.7236328125e-07, "loss": 0.0023, "reward": 1.7220868468284607, "reward_std": 0.045497006736695766, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7220868170261383, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 354.9375, "epoch": 1.3115234375, "grad_norm": 4.128346398936986, "kl": 0.142822265625, "learning_rate": 6.722412109375e-07, "loss": 0.0057, "reward": 1.8581845164299011, "reward_std": 0.0643857903778553, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8581845462322235, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 351.1171875, "epoch": 1.31201171875, "grad_norm": 2.486707810004422, "kl": 0.0672607421875, "learning_rate": 6.72119140625e-07, "loss": 0.0027, "reward": 1.81499183177948, "reward_std": 0.04949922952800989, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.81499183177948, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 278.03125, "epoch": 1.3125, "grad_norm": 1.7415850172344802, "kl": 0.079345703125, "learning_rate": 6.719970703124999e-07, "loss": 0.0032, "reward": 1.8236759305000305, "reward_std": 0.038199277594685555, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8236759305000305, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 275.2890625, "epoch": 1.31298828125, "grad_norm": 4.382732157563532, "kl": 0.07080078125, "learning_rate": 6.718749999999999e-07, "loss": 0.0028, "reward": 1.7863699793815613, "reward_std": 0.09564121440052986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.786370038986206, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 268.421875, "epoch": 1.3134765625, "grad_norm": 1.9957093212811292, "kl": 0.06494140625, "learning_rate": 6.717529296875e-07, "loss": 0.0026, "reward": 1.729765772819519, "reward_std": 0.058095003478229046, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.729765772819519, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 289.09375, "epoch": 1.31396484375, "grad_norm": 1.5829270533122184, "kl": 0.05908203125, "learning_rate": 6.71630859375e-07, "loss": 0.0024, "reward": 1.7915682792663574, "reward_std": 0.06929146684706211, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7915682792663574, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 317.1015625, "epoch": 1.314453125, "grad_norm": 0.9886014203631965, "kl": 0.06201171875, "learning_rate": 6.715087890625e-07, "loss": 0.0025, "reward": 1.7871447801589966, "reward_std": 0.08929637633264065, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.810582309961319, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 248.1640625, "epoch": 1.31494140625, "grad_norm": 1.0123772399132263, "kl": 0.072509765625, "learning_rate": 6.7138671875e-07, "loss": 0.0029, "reward": 1.6426368355751038, "reward_std": 0.0795272197574377, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.642636775970459, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 259.0, "epoch": 1.3154296875, "grad_norm": 0.8408158657981815, "kl": 0.0609130859375, "learning_rate": 6.712646484374999e-07, "loss": 0.0024, "reward": 1.782248616218567, "reward_std": 0.026081452146172523, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7822486758232117, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 273.109375, "epoch": 1.31591796875, "grad_norm": 1.1570138238531078, "kl": 0.068359375, "learning_rate": 6.711425781249999e-07, "loss": 0.0027, "reward": 1.9411388635635376, "reward_std": 0.07927910797297955, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.941138744354248, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 261.75, "epoch": 1.31640625, "grad_norm": 6.129173902050601, "kl": 0.0728759765625, "learning_rate": 6.710205078125e-07, "loss": 0.0029, "reward": 1.7619558572769165, "reward_std": 0.060450656339526176, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7619557976722717, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 330.09375, "epoch": 1.31689453125, "grad_norm": 2.0134898217947166, "kl": 0.07568359375, "learning_rate": 6.708984375e-07, "loss": 0.003, "reward": 1.7178034782409668, "reward_std": 0.08819794841110706, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7256160378456116, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 271.65625, "epoch": 1.3173828125, "grad_norm": 2.2035043252857203, "kl": 0.075927734375, "learning_rate": 6.707763671875e-07, "loss": 0.003, "reward": 1.7355281710624695, "reward_std": 0.13643942587077618, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7667781114578247, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 287.8359375, "epoch": 1.31787109375, "grad_norm": 1.556741176565315, "kl": 0.0655517578125, "learning_rate": 6.70654296875e-07, "loss": 0.0026, "reward": 1.8553110361099243, "reward_std": 0.14732931554317474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8553110957145691, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 264.515625, "epoch": 1.318359375, "grad_norm": 0.8254256278213256, "kl": 0.0787353515625, "learning_rate": 6.705322265625e-07, "loss": 0.0032, "reward": 1.6898934841156006, "reward_std": 0.02993142046034336, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6898934543132782, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 261.0234375, "epoch": 1.31884765625, "grad_norm": 1.8226982226518016, "kl": 0.0614013671875, "learning_rate": 6.704101562499999e-07, "loss": 0.0025, "reward": 1.717129111289978, "reward_std": 0.03285204339772463, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.717129111289978, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 243.8984375, "epoch": 1.3193359375, "grad_norm": 12.341387634183427, "kl": 0.076416015625, "learning_rate": 6.702880859374999e-07, "loss": 0.0031, "reward": 1.725698471069336, "reward_std": 0.06217564269900322, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7256983816623688, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 305.1875, "epoch": 1.31982421875, "grad_norm": 0.9525369914253854, "kl": 0.060546875, "learning_rate": 6.70166015625e-07, "loss": 0.0024, "reward": 1.8281516432762146, "reward_std": 0.03971204720437527, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8281517326831818, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 361.1015625, "epoch": 1.3203125, "grad_norm": 1.2358295612242862, "kl": 0.072265625, "learning_rate": 6.700439453125e-07, "loss": 0.0029, "reward": 1.8739069700241089, "reward_std": 0.04518134891986847, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8739069700241089, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 206.8359375, "epoch": 1.32080078125, "grad_norm": 1.6234402625579745, "kl": 0.077880859375, "learning_rate": 6.69921875e-07, "loss": 0.0031, "reward": 1.6680772304534912, "reward_std": 0.03605229314416647, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6680772304534912, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 263.234375, "epoch": 1.3212890625, "grad_norm": 1.019102997793149, "kl": 0.08935546875, "learning_rate": 6.697998046875e-07, "loss": 0.0036, "reward": 1.7389346361160278, "reward_std": 0.04351983033120632, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7389346957206726, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 240.5, "epoch": 1.32177734375, "grad_norm": 2.8813575124650197, "kl": 0.07177734375, "learning_rate": 6.696777343749999e-07, "loss": 0.0029, "reward": 1.7355643510818481, "reward_std": 0.03399805910885334, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7355643510818481, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 258.9765625, "epoch": 1.322265625, "grad_norm": 2.810469992608748, "kl": 0.069091796875, "learning_rate": 6.695556640624999e-07, "loss": 0.0028, "reward": 1.841668725013733, "reward_std": 0.06313896924257278, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8416686952114105, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 197.8515625, "epoch": 1.32275390625, "grad_norm": 1.4321682955957, "kl": 0.054443359375, "learning_rate": 6.6943359375e-07, "loss": 0.0022, "reward": 1.8613044619560242, "reward_std": 0.023028030525892973, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8613044023513794, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 278.03125, "epoch": 1.3232421875, "grad_norm": 1.1730734025210117, "kl": 0.064453125, "learning_rate": 6.693115234375e-07, "loss": 0.0026, "reward": 1.7823152542114258, "reward_std": 0.09651216119527817, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7979402244091034, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 205.28125, "epoch": 1.32373046875, "grad_norm": 8.059925351400762, "kl": 0.09912109375, "learning_rate": 6.69189453125e-07, "loss": 0.004, "reward": 1.7996364831924438, "reward_std": 0.07981680566444993, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8152614533901215, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 248.3046875, "epoch": 1.32421875, "grad_norm": 1.3591185961038974, "kl": 0.082275390625, "learning_rate": 6.690673828125e-07, "loss": 0.0033, "reward": 1.7113600969314575, "reward_std": 0.042755890637636185, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7113600373268127, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 340.7265625, "epoch": 1.32470703125, "grad_norm": 2.8367504069887737, "kl": 0.0732421875, "learning_rate": 6.689453125e-07, "loss": 0.0029, "reward": 1.7709915041923523, "reward_std": 0.04804490879178047, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7709915339946747, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 262.34375, "epoch": 1.3251953125, "grad_norm": 2.423950195941969, "kl": 0.048828125, "learning_rate": 6.688232421874999e-07, "loss": 0.002, "reward": 1.8260767459869385, "reward_std": 0.054210664704442024, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8260767161846161, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 322.8671875, "epoch": 1.32568359375, "grad_norm": 1.2894446032385674, "kl": 0.0791015625, "learning_rate": 6.687011718749999e-07, "loss": 0.0032, "reward": 1.7933790683746338, "reward_std": 0.048833588138222694, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7933790981769562, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 337.09375, "epoch": 1.326171875, "grad_norm": 5.206795373840309, "kl": 0.067626953125, "learning_rate": 6.685791015625e-07, "loss": 0.0027, "reward": 1.712072491645813, "reward_std": 0.08722497709095478, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7120724618434906, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 287.890625, "epoch": 1.32666015625, "grad_norm": 1.567961181256668, "kl": 0.060546875, "learning_rate": 6.6845703125e-07, "loss": 0.0024, "reward": 1.7127328515052795, "reward_std": 0.05422433838248253, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7127328813076019, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 223.25, "epoch": 1.3271484375, "grad_norm": 1.3532097514243422, "kl": 0.0728759765625, "learning_rate": 6.683349609375e-07, "loss": 0.0029, "reward": 1.730715036392212, "reward_std": 0.04247327148914337, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7307150363922119, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 289.703125, "epoch": 1.32763671875, "grad_norm": 2.413627159184983, "kl": 0.08154296875, "learning_rate": 6.68212890625e-07, "loss": 0.0033, "reward": 1.7469477653503418, "reward_std": 0.15270064398646355, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7703852355480194, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 379.9375, "epoch": 1.328125, "grad_norm": 6.093449713291131, "kl": 0.0732421875, "learning_rate": 6.680908203125e-07, "loss": 0.0029, "reward": 1.7299081683158875, "reward_std": 0.0472866240888834, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7299081385135651, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 232.8984375, "epoch": 1.32861328125, "grad_norm": 4.912267833376837, "kl": 0.078125, "learning_rate": 6.679687499999999e-07, "loss": 0.0031, "reward": 1.8125880360603333, "reward_std": 0.06944674998521805, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8125880360603333, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 225.7890625, "epoch": 1.3291015625, "grad_norm": 1.4881380474177748, "kl": 0.062744140625, "learning_rate": 6.678466796875e-07, "loss": 0.0025, "reward": 1.7330250144004822, "reward_std": 0.08363675326108932, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7408375144004822, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 223.390625, "epoch": 1.32958984375, "grad_norm": 1.4668229727045794, "kl": 0.07763671875, "learning_rate": 6.67724609375e-07, "loss": 0.0031, "reward": 1.805801808834076, "reward_std": 0.06017332337796688, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8058017492294312, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 315.0078125, "epoch": 1.330078125, "grad_norm": 2.118197040137688, "kl": 0.058349609375, "learning_rate": 6.676025390625e-07, "loss": 0.0023, "reward": 1.778078854084015, "reward_std": 0.09711121767759323, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8015162944793701, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 233.4921875, "epoch": 1.33056640625, "grad_norm": 0.7593383155856351, "kl": 0.083984375, "learning_rate": 6.6748046875e-07, "loss": 0.0034, "reward": 1.5734055638313293, "reward_std": 0.08365354500710964, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.5890305191278458, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 256.7421875, "epoch": 1.3310546875, "grad_norm": 14.803652390678243, "kl": 0.075439453125, "learning_rate": 6.673583984375e-07, "loss": 0.003, "reward": 1.8133496046066284, "reward_std": 0.01667138608172536, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8133496046066284, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 345.0703125, "epoch": 1.33154296875, "grad_norm": 2.120863946435663, "kl": 0.070068359375, "learning_rate": 6.672363281249999e-07, "loss": 0.0028, "reward": 1.6798765659332275, "reward_std": 0.12814366817474365, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6876890957355499, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 285.1484375, "epoch": 1.33203125, "grad_norm": 0.7789537405347509, "kl": 0.068603515625, "learning_rate": 6.671142578124999e-07, "loss": 0.0027, "reward": 1.7501285672187805, "reward_std": 0.02368486486375332, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7501285076141357, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 304.7265625, "epoch": 1.33251953125, "grad_norm": 1.6295888889315164, "kl": 0.075927734375, "learning_rate": 6.669921875e-07, "loss": 0.003, "reward": 1.6839573979377747, "reward_std": 0.04334849305450916, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6839573085308075, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 257.875, "epoch": 1.3330078125, "grad_norm": 1.6509903224943192, "kl": 0.0718994140625, "learning_rate": 6.668701171875e-07, "loss": 0.0029, "reward": 1.8505354523658752, "reward_std": 0.033067792654037476, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8505354523658752, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 265.421875, "epoch": 1.33349609375, "grad_norm": 0.7796283131638133, "kl": 0.08837890625, "learning_rate": 6.66748046875e-07, "loss": 0.0035, "reward": 1.7302683591842651, "reward_std": 0.06862842850387096, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7380808293819427, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 259.4453125, "epoch": 1.333984375, "grad_norm": 1.4712873612302482, "kl": 0.05859375, "learning_rate": 6.666259765625e-07, "loss": 0.0023, "reward": 1.8550618886947632, "reward_std": 0.09110767394304276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8550618886947632, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 299.4140625, "epoch": 1.33447265625, "grad_norm": 3.881608704348827, "kl": 0.085693359375, "learning_rate": 6.6650390625e-07, "loss": 0.0034, "reward": 1.7282916903495789, "reward_std": 0.10343683697283268, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7517292201519012, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 213.9921875, "epoch": 1.3349609375, "grad_norm": 1.0148415053766122, "kl": 0.069091796875, "learning_rate": 6.663818359374999e-07, "loss": 0.0028, "reward": 1.8618406057357788, "reward_std": 0.024386493489146233, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8618406653404236, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 306.609375, "epoch": 1.33544921875, "grad_norm": 1.0975253206002726, "kl": 0.07275390625, "learning_rate": 6.66259765625e-07, "loss": 0.0029, "reward": 1.7836071848869324, "reward_std": 0.0855344720184803, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7914197146892548, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 320.921875, "epoch": 1.3359375, "grad_norm": 2.3233855799239578, "kl": 0.069580078125, "learning_rate": 6.661376953125e-07, "loss": 0.0028, "reward": 1.7295535802841187, "reward_std": 0.08530437387526035, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7529910504817963, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 249.0625, "epoch": 1.33642578125, "grad_norm": 0.9699266686728515, "kl": 0.077392578125, "learning_rate": 6.66015625e-07, "loss": 0.0031, "reward": 1.841222882270813, "reward_std": 0.0592461503110826, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.841222882270813, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 306.046875, "epoch": 1.3369140625, "grad_norm": 2.7201637371137823, "kl": 0.092529296875, "learning_rate": 6.658935546875e-07, "loss": 0.0037, "reward": 1.7343988418579102, "reward_std": 0.1262606419622898, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7656488418579102, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 326.6875, "epoch": 1.33740234375, "grad_norm": 2.8247996935464554, "kl": 0.0908203125, "learning_rate": 6.65771484375e-07, "loss": 0.0036, "reward": 1.7593631744384766, "reward_std": 0.03225879417732358, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7593631744384766, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 308.5, "epoch": 1.337890625, "grad_norm": 1.160437436072484, "kl": 0.0732421875, "learning_rate": 6.656494140624999e-07, "loss": 0.0029, "reward": 1.7741823196411133, "reward_std": 0.047216689214110374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7741822898387909, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 284.953125, "epoch": 1.33837890625, "grad_norm": 2.843629986590572, "kl": 0.07861328125, "learning_rate": 6.655273437499999e-07, "loss": 0.0031, "reward": 1.694653332233429, "reward_std": 0.07069635391235352, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6946533024311066, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 220.390625, "epoch": 1.3388671875, "grad_norm": 3.22653638503277, "kl": 0.0821533203125, "learning_rate": 6.654052734375e-07, "loss": 0.0033, "reward": 1.834531307220459, "reward_std": 0.0343925547786057, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8345312476158142, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 313.625, "epoch": 1.33935546875, "grad_norm": 5.463175138878789, "kl": 0.062744140625, "learning_rate": 6.65283203125e-07, "loss": 0.0025, "reward": 1.814225673675537, "reward_std": 0.024931567488238215, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8142256438732147, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 287.9765625, "epoch": 1.33984375, "grad_norm": 5.997176370030234, "kl": 0.07666015625, "learning_rate": 6.651611328125e-07, "loss": 0.0031, "reward": 1.778179109096527, "reward_std": 0.08071616850793362, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7781790792942047, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 201.953125, "epoch": 1.34033203125, "grad_norm": 2.1108738481028824, "kl": 0.0687255859375, "learning_rate": 6.650390625e-07, "loss": 0.0027, "reward": 1.6546780467033386, "reward_std": 0.04719951003789902, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6546780467033386, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 296.859375, "epoch": 1.3408203125, "grad_norm": 1.189954861287409, "kl": 0.077880859375, "learning_rate": 6.649169921875e-07, "loss": 0.0031, "reward": 1.722962737083435, "reward_std": 0.10328607633709908, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7385877668857574, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 246.359375, "epoch": 1.34130859375, "grad_norm": 2.3195408485561986, "kl": 0.078369140625, "learning_rate": 6.647949218749999e-07, "loss": 0.0031, "reward": 1.7592060565948486, "reward_std": 0.03395948093384504, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.759206086397171, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 302.640625, "epoch": 1.341796875, "grad_norm": 0.762429658572133, "kl": 0.06884765625, "learning_rate": 6.646728515625e-07, "loss": 0.0028, "reward": 1.813932180404663, "reward_std": 0.09360839053988457, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8373695611953735, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 224.75, "epoch": 1.34228515625, "grad_norm": 3.539568556920886, "kl": 0.0634765625, "learning_rate": 6.6455078125e-07, "loss": 0.0025, "reward": 1.750356376171112, "reward_std": 0.049200138077139854, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7503563463687897, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 268.140625, "epoch": 1.3427734375, "grad_norm": 3.9639963865650247, "kl": 0.0772705078125, "learning_rate": 6.644287109375e-07, "loss": 0.0031, "reward": 1.6544407606124878, "reward_std": 0.07485324889421463, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6544407606124878, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 325.546875, "epoch": 1.34326171875, "grad_norm": 7.157519189741742, "kl": 0.0570068359375, "learning_rate": 6.64306640625e-07, "loss": 0.0023, "reward": 1.7769380807876587, "reward_std": 0.12172123789787292, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7925631105899811, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 381.296875, "epoch": 1.34375, "grad_norm": 1.380768515593198, "kl": 0.053466796875, "learning_rate": 6.641845703125e-07, "loss": 0.0021, "reward": 1.6924372911453247, "reward_std": 0.11786646395921707, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7314998209476471, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 353.1875, "epoch": 1.34423828125, "grad_norm": 2.166002705894626, "kl": 0.0631103515625, "learning_rate": 6.640624999999999e-07, "loss": 0.0025, "reward": 1.8119662404060364, "reward_std": 0.0506830308586359, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8119662702083588, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 300.6875, "epoch": 1.3447265625, "grad_norm": 24.943229867842106, "kl": 0.0791015625, "learning_rate": 6.639404296874999e-07, "loss": 0.0032, "reward": 1.8568952083587646, "reward_std": 0.04123697895556688, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8568951487541199, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 279.03125, "epoch": 1.34521484375, "grad_norm": 1.8708805947768723, "kl": 0.090576171875, "learning_rate": 6.63818359375e-07, "loss": 0.0036, "reward": 1.7188897132873535, "reward_std": 0.056853797286748886, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7188896536827087, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 315.5078125, "epoch": 1.345703125, "grad_norm": 2.439239557617475, "kl": 0.0806884765625, "learning_rate": 6.636962890625e-07, "loss": 0.0032, "reward": 1.7047572135925293, "reward_std": 0.18556798994541168, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7438197731971741, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 354.171875, "epoch": 1.34619140625, "grad_norm": 2.160190526404616, "kl": 0.0657958984375, "learning_rate": 6.6357421875e-07, "loss": 0.0026, "reward": 1.847311556339264, "reward_std": 0.07765659689903259, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8551241159439087, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 247.9921875, "epoch": 1.3466796875, "grad_norm": 2.2755146480765056, "kl": 0.078857421875, "learning_rate": 6.634521484375e-07, "loss": 0.0031, "reward": 1.8561453819274902, "reward_std": 0.04817195236682892, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8561453819274902, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 385.390625, "epoch": 1.34716796875, "grad_norm": 4.259565415030043, "kl": 0.064697265625, "learning_rate": 6.63330078125e-07, "loss": 0.0026, "reward": 1.7508844137191772, "reward_std": 0.14695337787270546, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7665094435214996, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 281.4296875, "epoch": 1.34765625, "grad_norm": 1.761996511284053, "kl": 0.07861328125, "learning_rate": 6.632080078124999e-07, "loss": 0.0031, "reward": 1.8259143233299255, "reward_std": 0.11976262181997299, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8337267935276031, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 261.109375, "epoch": 1.34814453125, "grad_norm": 1.1530002116392095, "kl": 0.075439453125, "learning_rate": 6.630859374999999e-07, "loss": 0.003, "reward": 1.8094568252563477, "reward_std": 0.09631854109466076, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8407068252563477, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 264.8671875, "epoch": 1.3486328125, "grad_norm": 1.9409396087042068, "kl": 0.0567626953125, "learning_rate": 6.629638671875e-07, "loss": 0.0023, "reward": 1.8710192441940308, "reward_std": 0.061006827279925346, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8710193037986755, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 253.1171875, "epoch": 1.34912109375, "grad_norm": 0.7799863900997498, "kl": 0.085693359375, "learning_rate": 6.62841796875e-07, "loss": 0.0034, "reward": 1.77259761095047, "reward_std": 0.05002701282501221, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7725976407527924, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 291.484375, "epoch": 1.349609375, "grad_norm": 2.9890180449063792, "kl": 0.072509765625, "learning_rate": 6.627197265625e-07, "loss": 0.0029, "reward": 1.758193016052246, "reward_std": 0.11531753093004227, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7581930160522461, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 422.0234375, "epoch": 1.35009765625, "grad_norm": 3.384788495208568, "kl": 0.090087890625, "learning_rate": 6.6259765625e-07, "loss": 0.0036, "reward": 1.7781252264976501, "reward_std": 0.04981714114546776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7781251966953278, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 282.109375, "epoch": 1.3505859375, "grad_norm": 2.0776218039129155, "kl": 0.08349609375, "learning_rate": 6.624755859374999e-07, "loss": 0.0033, "reward": 1.7179874777793884, "reward_std": 0.06420490704476833, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7179875075817108, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 315.3203125, "epoch": 1.35107421875, "grad_norm": 1.3042813145233396, "kl": 0.071533203125, "learning_rate": 6.623535156249999e-07, "loss": 0.0029, "reward": 1.706727385520935, "reward_std": 0.04921235144138336, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7067274153232574, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 349.0625, "epoch": 1.3515625, "grad_norm": 0.8583618689491556, "kl": 0.05859375, "learning_rate": 6.622314453125e-07, "loss": 0.0023, "reward": 1.8275976777076721, "reward_std": 0.051028769463300705, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8275976479053497, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 233.046875, "epoch": 1.35205078125, "grad_norm": 1.3873179964338491, "kl": 0.0771484375, "learning_rate": 6.62109375e-07, "loss": 0.0031, "reward": 1.7254577279090881, "reward_std": 0.12088143825531006, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7254576981067657, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 462.703125, "epoch": 1.3525390625, "grad_norm": 1.5492890007706852, "kl": 0.09521484375, "learning_rate": 6.619873046875e-07, "loss": 0.0038, "reward": 1.7280957102775574, "reward_std": 0.10262476652860641, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7359082102775574, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 279.53125, "epoch": 1.35302734375, "grad_norm": 2.3373926002613374, "kl": 0.065185546875, "learning_rate": 6.61865234375e-07, "loss": 0.0026, "reward": 1.7113505005836487, "reward_std": 0.056995073333382607, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7113505005836487, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 251.609375, "epoch": 1.353515625, "grad_norm": 4.3744425979305355, "kl": 0.0986328125, "learning_rate": 6.617431640625e-07, "loss": 0.0039, "reward": 1.7181638479232788, "reward_std": 0.08497333526611328, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7181638479232788, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 309.96875, "epoch": 1.35400390625, "grad_norm": 14.857357422003494, "kl": 0.0657958984375, "learning_rate": 6.616210937499999e-07, "loss": 0.0026, "reward": 1.888843595981598, "reward_std": 0.046841708943247795, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8888436257839203, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 354.734375, "epoch": 1.3544921875, "grad_norm": 1.8069280935451315, "kl": 0.07373046875, "learning_rate": 6.614990234374999e-07, "loss": 0.0029, "reward": 1.868907868862152, "reward_std": 0.05142470262944698, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8689078092575073, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 231.4609375, "epoch": 1.35498046875, "grad_norm": 1.9204075579327249, "kl": 0.069091796875, "learning_rate": 6.61376953125e-07, "loss": 0.0028, "reward": 1.8886531591415405, "reward_std": 0.08299789018929005, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8886531889438629, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 271.4140625, "epoch": 1.35546875, "grad_norm": 1.6906533792821081, "kl": 0.0618896484375, "learning_rate": 6.612548828125e-07, "loss": 0.0025, "reward": 1.8281062841415405, "reward_std": 0.094516322016716, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8437312543392181, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 302.765625, "epoch": 1.35595703125, "grad_norm": 1.6386886335437982, "kl": 0.09033203125, "learning_rate": 6.611328125e-07, "loss": 0.0036, "reward": 1.6473196148872375, "reward_std": 0.07153589557856321, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6473196148872375, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 285.6328125, "epoch": 1.3564453125, "grad_norm": 3.776389069468664, "kl": 0.0716552734375, "learning_rate": 6.610107421875e-07, "loss": 0.0029, "reward": 1.7405164241790771, "reward_std": 0.11073359847068787, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7483289241790771, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 259.328125, "epoch": 1.35693359375, "grad_norm": 1.1085765846395907, "kl": 0.0662841796875, "learning_rate": 6.608886718749999e-07, "loss": 0.0027, "reward": 1.8612353205680847, "reward_std": 0.05960194766521454, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8612352311611176, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 323.8359375, "epoch": 1.357421875, "grad_norm": 1.1688355930446612, "kl": 0.075927734375, "learning_rate": 6.607666015624999e-07, "loss": 0.003, "reward": 1.668241798877716, "reward_std": 0.0824052020907402, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6682417988777161, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 288.921875, "epoch": 1.35791015625, "grad_norm": 1.80133842186887, "kl": 0.0592041015625, "learning_rate": 6.6064453125e-07, "loss": 0.0024, "reward": 1.8754128217697144, "reward_std": 0.02819860354065895, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.875412791967392, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 286.015625, "epoch": 1.3583984375, "grad_norm": 2.9845705873686876, "kl": 0.082763671875, "learning_rate": 6.605224609375e-07, "loss": 0.0033, "reward": 1.837379813194275, "reward_std": 0.0600747037678957, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8373798131942749, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 200.03125, "epoch": 1.35888671875, "grad_norm": 1.9146016266718324, "kl": 0.072021484375, "learning_rate": 6.60400390625e-07, "loss": 0.0029, "reward": 1.8498224020004272, "reward_std": 0.07501747971400619, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8498223423957825, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 285.9375, "epoch": 1.359375, "grad_norm": 0.9660214761219784, "kl": 0.066162109375, "learning_rate": 6.602783203125e-07, "loss": 0.0026, "reward": 1.708676815032959, "reward_std": 0.05009671114385128, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.708676815032959, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 266.0390625, "epoch": 1.35986328125, "grad_norm": 1.9126001884812758, "kl": 0.06884765625, "learning_rate": 6.6015625e-07, "loss": 0.0027, "reward": 1.7439513802528381, "reward_std": 0.05504639446735382, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7439513504505157, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 259.140625, "epoch": 1.3603515625, "grad_norm": 2.6388109061222647, "kl": 0.0625, "learning_rate": 6.600341796874999e-07, "loss": 0.0025, "reward": 1.8127487897872925, "reward_std": 0.05138452537357807, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8127487897872925, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 255.34375, "epoch": 1.36083984375, "grad_norm": 1.9324545434757587, "kl": 0.0615234375, "learning_rate": 6.599121093749999e-07, "loss": 0.0025, "reward": 1.8103123307228088, "reward_std": 0.08414742723107338, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8103123307228088, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 306.15625, "epoch": 1.361328125, "grad_norm": 1.710010117700723, "kl": 0.07861328125, "learning_rate": 6.597900390625e-07, "loss": 0.0031, "reward": 1.8647686839103699, "reward_std": 0.07051170617341995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8647686839103699, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 252.65625, "epoch": 1.36181640625, "grad_norm": 3.5098494042536115, "kl": 0.09814453125, "learning_rate": 6.5966796875e-07, "loss": 0.0039, "reward": 1.811535358428955, "reward_std": 0.17406302690505981, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8115352988243103, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 281.03125, "epoch": 1.3623046875, "grad_norm": 4.488729872719589, "kl": 0.054931640625, "learning_rate": 6.595458984375e-07, "loss": 0.0022, "reward": 1.823096752166748, "reward_std": 0.06310966797173023, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8230966925621033, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 284.9453125, "epoch": 1.36279296875, "grad_norm": 2.859947402634798, "kl": 0.0673828125, "learning_rate": 6.59423828125e-07, "loss": 0.0027, "reward": 1.7696452736854553, "reward_std": 0.09921448305249214, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7696452140808105, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 293.6640625, "epoch": 1.36328125, "grad_norm": 1.8490800827770573, "kl": 0.08642578125, "learning_rate": 6.593017578124999e-07, "loss": 0.0035, "reward": 1.77943754196167, "reward_std": 0.06473535671830177, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7794375717639923, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 380.546875, "epoch": 1.36376953125, "grad_norm": 1.1874902147313509, "kl": 0.0633544921875, "learning_rate": 6.591796874999999e-07, "loss": 0.0025, "reward": 1.8052760362625122, "reward_std": 0.05710322968661785, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8052760064601898, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 279.5078125, "epoch": 1.3642578125, "grad_norm": 1.3877294374512281, "kl": 0.0810546875, "learning_rate": 6.590576171875e-07, "loss": 0.0032, "reward": 1.7053526639938354, "reward_std": 0.044744652695953846, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7053526639938354, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 252.2890625, "epoch": 1.36474609375, "grad_norm": 1.7614738212994774, "kl": 0.091064453125, "learning_rate": 6.58935546875e-07, "loss": 0.0037, "reward": 1.8614672422409058, "reward_std": 0.06470566987991333, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8692797422409058, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 236.9921875, "epoch": 1.365234375, "grad_norm": 0.8216449129468091, "kl": 0.0640869140625, "learning_rate": 6.588134765625e-07, "loss": 0.0026, "reward": 1.7944404482841492, "reward_std": 0.07953635044395924, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8022529482841492, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 249.2890625, "epoch": 1.36572265625, "grad_norm": 3.7784315403086732, "kl": 0.0648193359375, "learning_rate": 6.5869140625e-07, "loss": 0.0026, "reward": 1.7856959700584412, "reward_std": 0.06272900477051735, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7856959700584412, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 237.5625, "epoch": 1.3662109375, "grad_norm": 1.9366371199429389, "kl": 0.0728759765625, "learning_rate": 6.585693359375e-07, "loss": 0.0029, "reward": 1.7936596274375916, "reward_std": 0.07175188139081001, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7936596572399139, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 289.078125, "epoch": 1.36669921875, "grad_norm": 1.3702102849644753, "kl": 0.063720703125, "learning_rate": 6.584472656249999e-07, "loss": 0.0025, "reward": 1.7357134819030762, "reward_std": 0.09683592431247234, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7435259819030762, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 323.40625, "epoch": 1.3671875, "grad_norm": 0.7898266031129043, "kl": 0.082275390625, "learning_rate": 6.583251953124999e-07, "loss": 0.0033, "reward": 1.7490665912628174, "reward_std": 0.09386800974607468, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7568791508674622, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 317.4296875, "epoch": 1.36767578125, "grad_norm": 1.1152710272393713, "kl": 0.06396484375, "learning_rate": 6.58203125e-07, "loss": 0.0026, "reward": 1.8169459700584412, "reward_std": 0.05791633389890194, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8169459104537964, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 313.734375, "epoch": 1.3681640625, "grad_norm": 4.796604127693978, "kl": 0.0958251953125, "learning_rate": 6.580810546875e-07, "loss": 0.0038, "reward": 1.720919132232666, "reward_std": 0.08066519349813461, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7756066620349884, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 246.234375, "epoch": 1.36865234375, "grad_norm": 0.7665830601343561, "kl": 0.065673828125, "learning_rate": 6.57958984375e-07, "loss": 0.0026, "reward": 1.7669459581375122, "reward_std": 0.052986389957368374, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7747583985328674, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 291.2109375, "epoch": 1.369140625, "grad_norm": 0.8869108480679029, "kl": 0.052490234375, "learning_rate": 6.578369140625e-07, "loss": 0.0021, "reward": 1.7913293838500977, "reward_std": 0.057568637654185295, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7913293838500977, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 234.9140625, "epoch": 1.36962890625, "grad_norm": 1.1017743459095004, "kl": 0.079833984375, "learning_rate": 6.577148437499999e-07, "loss": 0.0032, "reward": 1.762194275856018, "reward_std": 0.027863549068570137, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7621943652629852, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 229.3671875, "epoch": 1.3701171875, "grad_norm": 5.067708962892902, "kl": 0.064697265625, "learning_rate": 6.575927734374999e-07, "loss": 0.0026, "reward": 1.6529717445373535, "reward_std": 0.07605472579598427, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6529717445373535, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 223.3203125, "epoch": 1.37060546875, "grad_norm": 3.5790933037952426, "kl": 0.07470703125, "learning_rate": 6.57470703125e-07, "loss": 0.003, "reward": 1.6717053651809692, "reward_std": 0.06556748226284981, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6717053353786469, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 389.28125, "epoch": 1.37109375, "grad_norm": 1.1882639853118668, "kl": 0.0606689453125, "learning_rate": 6.573486328125e-07, "loss": 0.0024, "reward": 1.7812290787696838, "reward_std": 0.11782369762659073, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8046665787696838, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 368.7734375, "epoch": 1.37158203125, "grad_norm": 1.0058167484415166, "kl": 0.0628662109375, "learning_rate": 6.572265625e-07, "loss": 0.0025, "reward": 1.658070147037506, "reward_std": 0.16661040857434273, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7049451470375061, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 258.3359375, "epoch": 1.3720703125, "grad_norm": 1.5691798635044372, "kl": 0.077392578125, "learning_rate": 6.571044921875e-07, "loss": 0.0031, "reward": 1.8431367874145508, "reward_std": 0.015472855884581804, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.843136727809906, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 373.0546875, "epoch": 1.37255859375, "grad_norm": 30.022616938561168, "kl": 0.063232421875, "learning_rate": 6.56982421875e-07, "loss": 0.0025, "reward": 1.5678275227546692, "reward_std": 0.1511671245098114, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5912650525569916, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 232.734375, "epoch": 1.373046875, "grad_norm": 2.0753106666854597, "kl": 0.070068359375, "learning_rate": 6.568603515624999e-07, "loss": 0.0028, "reward": 1.705108404159546, "reward_std": 0.14512356370687485, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7207334041595459, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 357.1796875, "epoch": 1.37353515625, "grad_norm": 1.3679898475411463, "kl": 0.052001953125, "learning_rate": 6.567382812499999e-07, "loss": 0.0021, "reward": 1.8090946078300476, "reward_std": 0.037844820879399776, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8090946674346924, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 267.859375, "epoch": 1.3740234375, "grad_norm": 1.9064678334594833, "kl": 0.0635986328125, "learning_rate": 6.566162109375e-07, "loss": 0.0025, "reward": 1.765863299369812, "reward_std": 0.07344381138682365, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.765863299369812, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 362.28125, "epoch": 1.37451171875, "grad_norm": 0.7913906152817536, "kl": 0.0477294921875, "learning_rate": 6.56494140625e-07, "loss": 0.0019, "reward": 1.7781551480293274, "reward_std": 0.05877980962395668, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7859676480293274, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 407.9375, "epoch": 1.375, "grad_norm": 3.040552646693815, "kl": 0.0626220703125, "learning_rate": 6.563720703125e-07, "loss": 0.0025, "reward": 1.8006147146224976, "reward_std": 0.11222148686647415, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.80842724442482, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 310.09375, "epoch": 1.37548828125, "grad_norm": 1.1902464862626296, "kl": 0.07080078125, "learning_rate": 6.5625e-07, "loss": 0.0028, "reward": 1.7808747291564941, "reward_std": 0.06534177996218204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7808747291564941, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 367.5390625, "epoch": 1.3759765625, "grad_norm": 1.1270478089250884, "kl": 0.0849609375, "learning_rate": 6.561279296875e-07, "loss": 0.0034, "reward": 1.6131686568260193, "reward_std": 0.13650833070278168, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.6678561568260193, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 346.578125, "epoch": 1.37646484375, "grad_norm": 2.463194086778472, "kl": 0.083251953125, "learning_rate": 6.560058593749999e-07, "loss": 0.0033, "reward": 1.743131935596466, "reward_std": 0.13736629113554955, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7743819355964661, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 296.8203125, "epoch": 1.376953125, "grad_norm": 1.655877833888544, "kl": 0.061767578125, "learning_rate": 6.558837890625e-07, "loss": 0.0025, "reward": 1.8583369255065918, "reward_std": 0.06905535236001015, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8583369851112366, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 338.0703125, "epoch": 1.37744140625, "grad_norm": 1.1615671365734472, "kl": 0.064208984375, "learning_rate": 6.5576171875e-07, "loss": 0.0026, "reward": 1.8210389018058777, "reward_std": 0.10824690014123917, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8366638422012329, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 290.8515625, "epoch": 1.3779296875, "grad_norm": 1.9462189546737534, "kl": 0.072265625, "learning_rate": 6.556396484375e-07, "loss": 0.0029, "reward": 1.7309507727622986, "reward_std": 0.12317908834666014, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7543882727622986, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 285.6796875, "epoch": 1.37841796875, "grad_norm": 0.7250821619105872, "kl": 0.0653076171875, "learning_rate": 6.55517578125e-07, "loss": 0.0026, "reward": 1.7630040049552917, "reward_std": 0.028386560268700123, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7630040049552917, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 418.078125, "epoch": 1.37890625, "grad_norm": 1.5496349062279748, "kl": 0.06787109375, "learning_rate": 6.553955078125e-07, "loss": 0.0027, "reward": 1.7218654155731201, "reward_std": 0.14879543986171484, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7374904155731201, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 271.3515625, "epoch": 1.37939453125, "grad_norm": 0.8887945048990987, "kl": 0.067626953125, "learning_rate": 6.552734374999999e-07, "loss": 0.0027, "reward": 1.754398226737976, "reward_std": 0.029574115527793765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7543981671333313, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 311.0, "epoch": 1.3798828125, "grad_norm": 1.0309054013771692, "kl": 0.0648193359375, "learning_rate": 6.551513671874999e-07, "loss": 0.0026, "reward": 1.9179275035858154, "reward_std": 0.013918052427470684, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9179275631904602, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 258.8125, "epoch": 1.38037109375, "grad_norm": 1.6179425907866827, "kl": 0.062744140625, "learning_rate": 6.55029296875e-07, "loss": 0.0025, "reward": 1.829226016998291, "reward_std": 0.09412107616662979, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8292260468006134, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 301.25, "epoch": 1.380859375, "grad_norm": 1.5403818911864977, "kl": 0.0736083984375, "learning_rate": 6.549072265625e-07, "loss": 0.0029, "reward": 1.7663615942001343, "reward_std": 0.02866467647254467, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7663616240024567, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 262.265625, "epoch": 1.38134765625, "grad_norm": 0.8033462865951467, "kl": 0.07373046875, "learning_rate": 6.5478515625e-07, "loss": 0.003, "reward": 1.855335772037506, "reward_std": 0.044215379282832146, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8553357422351837, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 340.796875, "epoch": 1.3818359375, "grad_norm": 2.522746992374405, "kl": 0.060791015625, "learning_rate": 6.546630859375e-07, "loss": 0.0024, "reward": 1.647928237915039, "reward_std": 0.13220302015542984, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6713657677173615, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 363.5703125, "epoch": 1.38232421875, "grad_norm": 1.7436794453218056, "kl": 0.0604248046875, "learning_rate": 6.54541015625e-07, "loss": 0.0024, "reward": 1.6440320014953613, "reward_std": 0.09271154180169106, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6596570014953613, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 228.453125, "epoch": 1.3828125, "grad_norm": 0.4931724331120224, "kl": 0.063232421875, "learning_rate": 6.544189453124999e-07, "loss": 0.0025, "reward": 1.8260149955749512, "reward_std": 0.024739277781918645, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8260150253772736, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 177.4765625, "epoch": 1.38330078125, "grad_norm": 1.093941327614133, "kl": 0.06396484375, "learning_rate": 6.54296875e-07, "loss": 0.0026, "reward": 1.7941319942474365, "reward_std": 0.017439838498830795, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7941320240497589, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 299.9296875, "epoch": 1.3837890625, "grad_norm": 1.0585896307141636, "kl": 0.060791015625, "learning_rate": 6.541748046875e-07, "loss": 0.0024, "reward": 1.8979597091674805, "reward_std": 0.05379013530910015, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8979597091674805, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 387.5390625, "epoch": 1.38427734375, "grad_norm": 2.579223044774424, "kl": 0.079345703125, "learning_rate": 6.54052734375e-07, "loss": 0.0032, "reward": 1.6481378078460693, "reward_std": 0.10650475323200226, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6559503078460693, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 303.546875, "epoch": 1.384765625, "grad_norm": 2.5957275431327025, "kl": 0.074951171875, "learning_rate": 6.539306640625e-07, "loss": 0.003, "reward": 1.7491188645362854, "reward_std": 0.04664234071969986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7491189241409302, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 372.0078125, "epoch": 1.38525390625, "grad_norm": 1.4332709494413776, "kl": 0.08251953125, "learning_rate": 6.5380859375e-07, "loss": 0.0033, "reward": 1.7521470189094543, "reward_std": 0.07698429748415947, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7521470487117767, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 306.3359375, "epoch": 1.3857421875, "grad_norm": 0.8961647450408224, "kl": 0.101806640625, "learning_rate": 6.536865234374999e-07, "loss": 0.0041, "reward": 1.7263582348823547, "reward_std": 0.05472889542579651, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7263582050800323, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 345.578125, "epoch": 1.38623046875, "grad_norm": 1.417539744141072, "kl": 0.077392578125, "learning_rate": 6.535644531249999e-07, "loss": 0.0031, "reward": 1.645662248134613, "reward_std": 0.10042403638362885, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6534747779369354, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 193.140625, "epoch": 1.38671875, "grad_norm": 6.057330374770775, "kl": 0.14404296875, "learning_rate": 6.534423828125e-07, "loss": 0.0058, "reward": 1.7092814445495605, "reward_std": 0.02909145038574934, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7092815637588501, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 376.078125, "epoch": 1.38720703125, "grad_norm": 0.9907276256447438, "kl": 0.06494140625, "learning_rate": 6.533203125e-07, "loss": 0.0026, "reward": 1.674263060092926, "reward_std": 0.17288543283939362, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7133256196975708, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 340.8125, "epoch": 1.3876953125, "grad_norm": 72.38821291532156, "kl": 0.57373046875, "learning_rate": 6.531982421875e-07, "loss": 0.0229, "reward": 1.7409939765930176, "reward_std": 0.14576169103384018, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7566189765930176, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 213.6328125, "epoch": 1.38818359375, "grad_norm": 0.7516084081038863, "kl": 0.07373046875, "learning_rate": 6.53076171875e-07, "loss": 0.003, "reward": 1.7337377667427063, "reward_std": 0.05978046730160713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7337378263473511, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 336.734375, "epoch": 1.388671875, "grad_norm": 1.1741854698445926, "kl": 0.0694580078125, "learning_rate": 6.529541015625e-07, "loss": 0.0028, "reward": 1.6884747743606567, "reward_std": 0.12016388587653637, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7040997445583344, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 305.5078125, "epoch": 1.38916015625, "grad_norm": 1.7849395697497092, "kl": 0.0703125, "learning_rate": 6.528320312499999e-07, "loss": 0.0028, "reward": 1.7630398273468018, "reward_std": 0.0514018889516592, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7630398273468018, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 361.734375, "epoch": 1.3896484375, "grad_norm": 1.2904172737765798, "kl": 0.065673828125, "learning_rate": 6.527099609375e-07, "loss": 0.0026, "reward": 1.7109894752502441, "reward_std": 0.037571437656879425, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7109894752502441, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 310.0859375, "epoch": 1.39013671875, "grad_norm": 0.9983955556960216, "kl": 0.073974609375, "learning_rate": 6.52587890625e-07, "loss": 0.003, "reward": 1.7116398215293884, "reward_std": 0.061076716519892216, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.711639791727066, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 281.71875, "epoch": 1.390625, "grad_norm": 0.7920028351802623, "kl": 0.068603515625, "learning_rate": 6.524658203125e-07, "loss": 0.0027, "reward": 1.78658789396286, "reward_std": 0.06295670091640204, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7944003939628601, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 319.828125, "epoch": 1.39111328125, "grad_norm": 2.1506316903194094, "kl": 0.06982421875, "learning_rate": 6.5234375e-07, "loss": 0.0028, "reward": 1.8838631510734558, "reward_std": 0.06648493744432926, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8916757106781006, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 308.578125, "epoch": 1.3916015625, "grad_norm": 1.1112381345863096, "kl": 0.07421875, "learning_rate": 6.522216796875e-07, "loss": 0.003, "reward": 1.712832510471344, "reward_std": 0.07860787212848663, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7128325700759888, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 306.3828125, "epoch": 1.39208984375, "grad_norm": 1.609890133849202, "kl": 0.0626220703125, "learning_rate": 6.520996093749999e-07, "loss": 0.0025, "reward": 1.802548348903656, "reward_std": 0.039428723976016045, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8025482892990112, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 338.8828125, "epoch": 1.392578125, "grad_norm": 2.1181979161526345, "kl": 0.09521484375, "learning_rate": 6.519775390624999e-07, "loss": 0.0038, "reward": 1.7357767820358276, "reward_std": 0.08965800702571869, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7357767522335052, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 334.421875, "epoch": 1.39306640625, "grad_norm": 6.007115954333245, "kl": 0.0819091796875, "learning_rate": 6.5185546875e-07, "loss": 0.0033, "reward": 1.7287788391113281, "reward_std": 0.08958043158054352, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7287788093090057, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 306.3125, "epoch": 1.3935546875, "grad_norm": 1.2649144166568242, "kl": 0.075927734375, "learning_rate": 6.517333984375e-07, "loss": 0.003, "reward": 1.6118924021720886, "reward_std": 0.1063384860754013, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6197049021720886, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 270.6875, "epoch": 1.39404296875, "grad_norm": 4.371721447909433, "kl": 0.08251953125, "learning_rate": 6.51611328125e-07, "loss": 0.0033, "reward": 1.5534948110580444, "reward_std": 0.04652980901300907, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5534948408603668, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 249.296875, "epoch": 1.39453125, "grad_norm": 1.1589218702703599, "kl": 0.076904296875, "learning_rate": 6.514892578125e-07, "loss": 0.0031, "reward": 1.6414119601249695, "reward_std": 0.059677837416529655, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6492244899272919, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 300.328125, "epoch": 1.39501953125, "grad_norm": 2.7183716635470994, "kl": 0.077392578125, "learning_rate": 6.513671875e-07, "loss": 0.0031, "reward": 1.732553243637085, "reward_std": 0.04065544903278351, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7325531840324402, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 318.0078125, "epoch": 1.3955078125, "grad_norm": 2.0094522160396378, "kl": 0.0869140625, "learning_rate": 6.512451171874999e-07, "loss": 0.0035, "reward": 1.7317038774490356, "reward_std": 0.04309108108282089, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7317038774490356, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 290.1015625, "epoch": 1.39599609375, "grad_norm": 1.7850835774009906, "kl": 0.0718994140625, "learning_rate": 6.511230468749999e-07, "loss": 0.0029, "reward": 1.751394808292389, "reward_std": 0.09872918948531151, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7513948082923889, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 351.53125, "epoch": 1.396484375, "grad_norm": 2.3667754366875067, "kl": 0.080810546875, "learning_rate": 6.510009765625e-07, "loss": 0.0032, "reward": 1.749050259590149, "reward_std": 0.057237736880779266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7490502893924713, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 265.71875, "epoch": 1.39697265625, "grad_norm": 4.5044551506426656, "kl": 0.0618896484375, "learning_rate": 6.5087890625e-07, "loss": 0.0025, "reward": 1.816435694694519, "reward_std": 0.05485322326421738, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.816435694694519, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 347.5625, "epoch": 1.3974609375, "grad_norm": 0.9423463055806306, "kl": 0.073486328125, "learning_rate": 6.507568359375e-07, "loss": 0.0029, "reward": 1.7192687392234802, "reward_std": 0.057240571826696396, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7192686796188354, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 336.0546875, "epoch": 1.39794921875, "grad_norm": 4.006446009955896, "kl": 0.070068359375, "learning_rate": 6.50634765625e-07, "loss": 0.0028, "reward": 1.7493478059768677, "reward_std": 0.08323598839342594, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7493478059768677, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 212.28125, "epoch": 1.3984375, "grad_norm": 1.1324756629593455, "kl": 0.075439453125, "learning_rate": 6.505126953124999e-07, "loss": 0.003, "reward": 1.8583284616470337, "reward_std": 0.01956217922270298, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8583284616470337, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 276.046875, "epoch": 1.39892578125, "grad_norm": 1.583271479867273, "kl": 0.0682373046875, "learning_rate": 6.503906249999999e-07, "loss": 0.0027, "reward": 1.8412460088729858, "reward_std": 0.11462399363517761, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8490585088729858, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 241.421875, "epoch": 1.3994140625, "grad_norm": 1.1615901347183661, "kl": 0.0711669921875, "learning_rate": 6.502685546875e-07, "loss": 0.0028, "reward": 1.7544441223144531, "reward_std": 0.05260493792593479, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7544440627098083, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 300.6875, "epoch": 1.39990234375, "grad_norm": 7.6165646200529284, "kl": 0.0791015625, "learning_rate": 6.50146484375e-07, "loss": 0.0032, "reward": 1.7561290264129639, "reward_std": 0.09663645923137665, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7717540264129639, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 366.28125, "epoch": 1.400390625, "grad_norm": 1.1725742243241242, "kl": 0.069091796875, "learning_rate": 6.500244140625e-07, "loss": 0.0028, "reward": 1.6970765590667725, "reward_std": 0.13552076928317547, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7048889398574829, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 232.578125, "epoch": 1.40087890625, "grad_norm": 3.094101668652219, "kl": 0.0693359375, "learning_rate": 6.4990234375e-07, "loss": 0.0028, "reward": 1.7442750334739685, "reward_std": 0.05703293904662132, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7442750930786133, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 309.859375, "epoch": 1.4013671875, "grad_norm": 3.527416892215106, "kl": 0.0927734375, "learning_rate": 6.497802734375e-07, "loss": 0.0037, "reward": 1.658632516860962, "reward_std": 0.05164991691708565, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6586325764656067, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 263.890625, "epoch": 1.40185546875, "grad_norm": 8.518640897558415, "kl": 0.09228515625, "learning_rate": 6.496582031249999e-07, "loss": 0.0037, "reward": 1.7752405405044556, "reward_std": 0.04082014970481396, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7752405405044556, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 377.3046875, "epoch": 1.40234375, "grad_norm": 1.2735396040272664, "kl": 0.0635986328125, "learning_rate": 6.495361328124999e-07, "loss": 0.0025, "reward": 1.8235573172569275, "reward_std": 0.04111157916486263, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8235573768615723, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 268.390625, "epoch": 1.40283203125, "grad_norm": 1.91252386313284, "kl": 0.07470703125, "learning_rate": 6.494140625e-07, "loss": 0.003, "reward": 1.7103378772735596, "reward_std": 0.06228804960846901, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7103378474712372, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 301.828125, "epoch": 1.4033203125, "grad_norm": 1.1497181496044657, "kl": 0.088134765625, "learning_rate": 6.492919921875e-07, "loss": 0.0035, "reward": 1.75039541721344, "reward_std": 0.04632897302508354, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7503954172134399, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 263.125, "epoch": 1.40380859375, "grad_norm": 2.0387706691630783, "kl": 0.068115234375, "learning_rate": 6.49169921875e-07, "loss": 0.0027, "reward": 1.799646258354187, "reward_std": 0.09669975563883781, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.807458758354187, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 282.7890625, "epoch": 1.404296875, "grad_norm": 1.3450734188530216, "kl": 0.0771484375, "learning_rate": 6.490478515625e-07, "loss": 0.0031, "reward": 1.7853235602378845, "reward_std": 0.11038247868418694, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7931360602378845, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 332.9609375, "epoch": 1.40478515625, "grad_norm": 2.8171006789126753, "kl": 0.0574951171875, "learning_rate": 6.489257812499999e-07, "loss": 0.0023, "reward": 1.7852590084075928, "reward_std": 0.10786662250757217, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7930714786052704, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 328.9375, "epoch": 1.4052734375, "grad_norm": 7.972532719423657, "kl": 0.0927734375, "learning_rate": 6.488037109374999e-07, "loss": 0.0037, "reward": 1.6059449911117554, "reward_std": 0.0608881339430809, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.605944961309433, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 415.3203125, "epoch": 1.40576171875, "grad_norm": 1.0381545239399501, "kl": 0.062255859375, "learning_rate": 6.48681640625e-07, "loss": 0.0025, "reward": 1.7436492443084717, "reward_std": 0.10550978034734726, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7514617443084717, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 379.75, "epoch": 1.40625, "grad_norm": 0.9257238624924248, "kl": 0.0810546875, "learning_rate": 6.485595703125e-07, "loss": 0.0032, "reward": 1.7538402080535889, "reward_std": 0.08971455320715904, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7616527676582336, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 206.1796875, "epoch": 1.40673828125, "grad_norm": 2.4068440590607576, "kl": 0.083740234375, "learning_rate": 6.484375e-07, "loss": 0.0034, "reward": 1.7696812748908997, "reward_std": 0.05204281397163868, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7696812748908997, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 344.0703125, "epoch": 1.4072265625, "grad_norm": 3.6657345328091027, "kl": 0.0927734375, "learning_rate": 6.483154296875e-07, "loss": 0.0037, "reward": 1.7494273781776428, "reward_std": 0.12914881110191345, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7650523781776428, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 302.15625, "epoch": 1.40771484375, "grad_norm": 1.3100826271644412, "kl": 0.09375, "learning_rate": 6.48193359375e-07, "loss": 0.0038, "reward": 1.7687904238700867, "reward_std": 0.03958193212747574, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7687904834747314, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 258.21875, "epoch": 1.408203125, "grad_norm": 2.850476912346545, "kl": 0.0791015625, "learning_rate": 6.480712890624999e-07, "loss": 0.0032, "reward": 1.718224585056305, "reward_std": 0.07007079944014549, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7182245850563049, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 263.0703125, "epoch": 1.40869140625, "grad_norm": 2.224248079259704, "kl": 0.070556640625, "learning_rate": 6.479492187499999e-07, "loss": 0.0028, "reward": 1.81133633852005, "reward_std": 0.07499665580689907, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.81133633852005, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 302.9296875, "epoch": 1.4091796875, "grad_norm": 1.6913926305711484, "kl": 0.0560302734375, "learning_rate": 6.478271484375e-07, "loss": 0.0022, "reward": 1.7844181060791016, "reward_std": 0.1209041029214859, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7844181060791016, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 329.0625, "epoch": 1.40966796875, "grad_norm": 1.0647796466049049, "kl": 0.084228515625, "learning_rate": 6.47705078125e-07, "loss": 0.0034, "reward": 1.8228704333305359, "reward_std": 0.03462422825396061, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8228704631328583, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 308.7578125, "epoch": 1.41015625, "grad_norm": 1.4054822535501044, "kl": 0.059326171875, "learning_rate": 6.475830078125e-07, "loss": 0.0024, "reward": 1.7760130167007446, "reward_std": 0.0543476827442646, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.776013046503067, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 265.34375, "epoch": 1.41064453125, "grad_norm": 6.2835453355319535, "kl": 0.057373046875, "learning_rate": 6.474609375e-07, "loss": 0.0023, "reward": 1.8040868043899536, "reward_std": 0.046066829934716225, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8040868043899536, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 352.96875, "epoch": 1.4111328125, "grad_norm": 2.407813465668005, "kl": 0.069580078125, "learning_rate": 6.473388671874999e-07, "loss": 0.0028, "reward": 1.6971461772918701, "reward_std": 0.08162091299891472, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6971462368965149, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 282.9921875, "epoch": 1.41162109375, "grad_norm": 6.777877992014395, "kl": 0.068359375, "learning_rate": 6.472167968749999e-07, "loss": 0.0027, "reward": 1.7807137966156006, "reward_std": 0.07970836386084557, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7807137668132782, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 422.4453125, "epoch": 1.412109375, "grad_norm": 5.8315057276286675, "kl": 0.046142578125, "learning_rate": 6.470947265625e-07, "loss": 0.0018, "reward": 1.8434737920761108, "reward_std": 0.06651721894741058, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8512862920761108, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 446.265625, "epoch": 1.41259765625, "grad_norm": 1.3705583029994997, "kl": 0.0595703125, "learning_rate": 6.4697265625e-07, "loss": 0.0024, "reward": 1.6831304430961609, "reward_std": 0.22392578423023224, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7378179430961609, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 358.6796875, "epoch": 1.4130859375, "grad_norm": 0.8488028443872007, "kl": 0.066162109375, "learning_rate": 6.468505859375e-07, "loss": 0.0026, "reward": 1.738794982433319, "reward_std": 0.13992030546069145, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7700450122356415, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 322.5625, "epoch": 1.41357421875, "grad_norm": 1.069830600645475, "kl": 0.080322265625, "learning_rate": 6.46728515625e-07, "loss": 0.0032, "reward": 1.4707675576210022, "reward_std": 0.07445183768868446, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.5254550278186798, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 343.984375, "epoch": 1.4140625, "grad_norm": 1.2750408423135347, "kl": 0.060791015625, "learning_rate": 6.466064453125e-07, "loss": 0.0024, "reward": 1.770975410938263, "reward_std": 0.11516737192869186, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7866004109382629, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 400.359375, "epoch": 1.41455078125, "grad_norm": 2.2794674346865134, "kl": 0.064697265625, "learning_rate": 6.464843749999999e-07, "loss": 0.0026, "reward": 1.7922708988189697, "reward_std": 0.1690206415951252, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8235209882259369, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 273.75, "epoch": 1.4150390625, "grad_norm": 1.0310963228503536, "kl": 0.068603515625, "learning_rate": 6.463623046874999e-07, "loss": 0.0027, "reward": 1.8120849132537842, "reward_std": 0.06387075781822205, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8198973536491394, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 337.8046875, "epoch": 1.41552734375, "grad_norm": 0.941272249249113, "kl": 0.0628662109375, "learning_rate": 6.46240234375e-07, "loss": 0.0025, "reward": 1.6103965044021606, "reward_std": 0.13843106850981712, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6416464447975159, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 323.3359375, "epoch": 1.416015625, "grad_norm": 2.4628580027257514, "kl": 0.0589599609375, "learning_rate": 6.461181640625e-07, "loss": 0.0024, "reward": 1.822964370250702, "reward_std": 0.13363437354564667, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8464018404483795, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 311.5703125, "epoch": 1.41650390625, "grad_norm": 3.35939246403186, "kl": 0.07763671875, "learning_rate": 6.4599609375e-07, "loss": 0.0031, "reward": 1.8861233592033386, "reward_std": 0.17267528176307678, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8939358592033386, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 340.5, "epoch": 1.4169921875, "grad_norm": 1.678376818469578, "kl": 0.07861328125, "learning_rate": 6.458740234375e-07, "loss": 0.0031, "reward": 1.904150128364563, "reward_std": 0.03639446757733822, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9041501879692078, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 379.0625, "epoch": 1.41748046875, "grad_norm": 1.8009959754813598, "kl": 0.07958984375, "learning_rate": 6.457519531249999e-07, "loss": 0.0032, "reward": 1.738788664340973, "reward_std": 0.04321512393653393, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7387886047363281, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 261.2578125, "epoch": 1.41796875, "grad_norm": 1.751332386306868, "kl": 0.072998046875, "learning_rate": 6.456298828124999e-07, "loss": 0.0029, "reward": 1.8467352390289307, "reward_std": 0.05003441125154495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8467352986335754, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 329.796875, "epoch": 1.41845703125, "grad_norm": 0.8841298672283215, "kl": 0.081298828125, "learning_rate": 6.455078125e-07, "loss": 0.0033, "reward": 1.6881967186927795, "reward_std": 0.09820759668946266, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7038217782974243, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 223.7265625, "epoch": 1.4189453125, "grad_norm": 2.2187376677871553, "kl": 0.083984375, "learning_rate": 6.453857421875e-07, "loss": 0.0034, "reward": 1.7173711061477661, "reward_std": 0.11199202761054039, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7251836061477661, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 306.765625, "epoch": 1.41943359375, "grad_norm": 1.677314384639223, "kl": 0.0687255859375, "learning_rate": 6.45263671875e-07, "loss": 0.0028, "reward": 1.7222880125045776, "reward_std": 0.08582095801830292, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7222879230976105, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 335.2578125, "epoch": 1.419921875, "grad_norm": 1.5234578995591637, "kl": 0.0589599609375, "learning_rate": 6.451416015625e-07, "loss": 0.0024, "reward": 1.786740779876709, "reward_std": 0.1870395466685295, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.8179908096790314, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 247.8046875, "epoch": 1.42041015625, "grad_norm": 3.9967357862014166, "kl": 0.069091796875, "learning_rate": 6.4501953125e-07, "loss": 0.0028, "reward": 1.7316583395004272, "reward_std": 0.0972440093755722, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7394708096981049, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 293.2890625, "epoch": 1.4208984375, "grad_norm": 0.9583192864484285, "kl": 0.056884765625, "learning_rate": 6.448974609374999e-07, "loss": 0.0023, "reward": 1.7740533947944641, "reward_std": 0.06026652827858925, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7740534543991089, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 256.1484375, "epoch": 1.42138671875, "grad_norm": 2.6735176863175707, "kl": 0.0640869140625, "learning_rate": 6.447753906249999e-07, "loss": 0.0026, "reward": 1.8210537433624268, "reward_std": 0.10449858009815216, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8288662135601044, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 316.5859375, "epoch": 1.421875, "grad_norm": 5.569660343086, "kl": 0.05224609375, "learning_rate": 6.446533203125e-07, "loss": 0.0021, "reward": 1.8247524499893188, "reward_std": 0.0780985876917839, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8325649201869965, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 250.1953125, "epoch": 1.42236328125, "grad_norm": 9.353451383129727, "kl": 0.068115234375, "learning_rate": 6.4453125e-07, "loss": 0.0027, "reward": 1.7248165011405945, "reward_std": 0.09427638724446297, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7248164713382721, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 321.3828125, "epoch": 1.4228515625, "grad_norm": 0.8980177691930714, "kl": 0.06640625, "learning_rate": 6.444091796875e-07, "loss": 0.0027, "reward": 1.817960262298584, "reward_std": 0.02281077764928341, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8179602324962616, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 321.5859375, "epoch": 1.42333984375, "grad_norm": 1.6111400955213642, "kl": 0.08740234375, "learning_rate": 6.44287109375e-07, "loss": 0.0035, "reward": 1.7714014053344727, "reward_std": 0.04745063558220863, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7714014053344727, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 203.0, "epoch": 1.423828125, "grad_norm": 0.9803662051320426, "kl": 0.081298828125, "learning_rate": 6.441650390625e-07, "loss": 0.0032, "reward": 1.8443069458007812, "reward_std": 0.044496684800833464, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8443069458007812, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 424.640625, "epoch": 1.42431640625, "grad_norm": 1.0830923513904844, "kl": 0.063232421875, "learning_rate": 6.440429687499999e-07, "loss": 0.0025, "reward": 1.739950180053711, "reward_std": 0.09381197765469551, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7477626502513885, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 294.0703125, "epoch": 1.4248046875, "grad_norm": 1.7415926590595752, "kl": 0.07080078125, "learning_rate": 6.439208984375e-07, "loss": 0.0028, "reward": 1.8552255630493164, "reward_std": 0.04862390458583832, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8552254736423492, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 255.3359375, "epoch": 1.42529296875, "grad_norm": 2.190010550447508, "kl": 0.0662841796875, "learning_rate": 6.43798828125e-07, "loss": 0.0026, "reward": 1.6964465975761414, "reward_std": 0.062107209116220474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6964466571807861, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 345.234375, "epoch": 1.42578125, "grad_norm": 2.9783937153275017, "kl": 0.069091796875, "learning_rate": 6.436767578125e-07, "loss": 0.0028, "reward": 1.6416288614273071, "reward_std": 0.06529787369072437, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6494413912296295, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 340.375, "epoch": 1.42626953125, "grad_norm": 1.6599884910259533, "kl": 0.059326171875, "learning_rate": 6.435546875e-07, "loss": 0.0024, "reward": 1.7820017337799072, "reward_std": 0.06994332000613213, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7820016741752625, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 276.5390625, "epoch": 1.4267578125, "grad_norm": 0.839929372194412, "kl": 0.0582275390625, "learning_rate": 6.434326171875e-07, "loss": 0.0023, "reward": 1.812927007675171, "reward_std": 0.028376199770718813, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8129269778728485, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 310.4921875, "epoch": 1.42724609375, "grad_norm": 1.3813349416908838, "kl": 0.07763671875, "learning_rate": 6.433105468749999e-07, "loss": 0.0031, "reward": 1.560662865638733, "reward_std": 0.062494926154613495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5606628656387329, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 254.6015625, "epoch": 1.427734375, "grad_norm": 1.9195445310350872, "kl": 0.064453125, "learning_rate": 6.431884765624999e-07, "loss": 0.0026, "reward": 1.7902184128761292, "reward_std": 0.07021540775895119, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7902184724807739, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 240.5546875, "epoch": 1.42822265625, "grad_norm": 1.9159652352107033, "kl": 0.1015625, "learning_rate": 6.4306640625e-07, "loss": 0.0041, "reward": 1.8406153321266174, "reward_std": 0.09906695038080215, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8406153321266174, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 329.8046875, "epoch": 1.4287109375, "grad_norm": 2.002785748157656, "kl": 0.06640625, "learning_rate": 6.429443359375e-07, "loss": 0.0027, "reward": 1.7776061296463013, "reward_std": 0.05872194468975067, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7776060402393341, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 285.3828125, "epoch": 1.42919921875, "grad_norm": 1.101408367276945, "kl": 0.0751953125, "learning_rate": 6.42822265625e-07, "loss": 0.003, "reward": 1.9210602045059204, "reward_std": 0.04052995890378952, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9210601449012756, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 257.109375, "epoch": 1.4296875, "grad_norm": 25.134180738645007, "kl": 0.06689453125, "learning_rate": 6.427001953125e-07, "loss": 0.0027, "reward": 1.780125081539154, "reward_std": 0.042955007404088974, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7801250517368317, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 247.625, "epoch": 1.43017578125, "grad_norm": 1.3853361093593575, "kl": 0.0748291015625, "learning_rate": 6.42578125e-07, "loss": 0.003, "reward": 1.8148647546768188, "reward_std": 0.05056310258805752, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8148646950721741, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 303.484375, "epoch": 1.4306640625, "grad_norm": 1.4127563383132502, "kl": 0.0538330078125, "learning_rate": 6.424560546874999e-07, "loss": 0.0022, "reward": 1.80906081199646, "reward_std": 0.05354410037398338, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8090607523918152, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 265.8671875, "epoch": 1.43115234375, "grad_norm": 0.9102013320719257, "kl": 0.0670166015625, "learning_rate": 6.42333984375e-07, "loss": 0.0027, "reward": 1.8222399950027466, "reward_std": 0.07436484284698963, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.822240024805069, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 301.015625, "epoch": 1.431640625, "grad_norm": 4.725685144329046, "kl": 0.083740234375, "learning_rate": 6.422119140625e-07, "loss": 0.0033, "reward": 1.7311421036720276, "reward_std": 0.11611544340848923, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7389545738697052, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 341.328125, "epoch": 1.43212890625, "grad_norm": 1.5563255627352082, "kl": 0.0810546875, "learning_rate": 6.4208984375e-07, "loss": 0.0032, "reward": 1.770340383052826, "reward_std": 0.07487385906279087, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7781528830528259, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 257.03125, "epoch": 1.4326171875, "grad_norm": 1.9586762224922618, "kl": 0.082763671875, "learning_rate": 6.419677734375e-07, "loss": 0.0033, "reward": 1.614130437374115, "reward_std": 0.025444690138101578, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.614130437374115, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 306.5390625, "epoch": 1.43310546875, "grad_norm": 1.4419175695980417, "kl": 0.080810546875, "learning_rate": 6.41845703125e-07, "loss": 0.0032, "reward": 1.8455346822738647, "reward_std": 0.20514215901494026, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8611597120761871, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 295.515625, "epoch": 1.43359375, "grad_norm": 1.993045779944748, "kl": 0.0830078125, "learning_rate": 6.417236328124999e-07, "loss": 0.0033, "reward": 1.7336124181747437, "reward_std": 0.10690167173743248, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7492374181747437, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 267.9609375, "epoch": 1.43408203125, "grad_norm": 1.283473748968174, "kl": 0.095458984375, "learning_rate": 6.416015624999999e-07, "loss": 0.0038, "reward": 1.7129297852516174, "reward_std": 0.0318203317001462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7129298150539398, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 311.3046875, "epoch": 1.4345703125, "grad_norm": 1.1453748810817357, "kl": 0.0654296875, "learning_rate": 6.414794921875e-07, "loss": 0.0026, "reward": 1.6736072897911072, "reward_std": 0.08154010493308306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6736072897911072, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 335.25, "epoch": 1.43505859375, "grad_norm": 2.3905058136733777, "kl": 0.0712890625, "learning_rate": 6.41357421875e-07, "loss": 0.0028, "reward": 1.7600257396697998, "reward_std": 0.10162025317549706, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7756507098674774, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 311.0234375, "epoch": 1.435546875, "grad_norm": 0.9852440812866968, "kl": 0.0693359375, "learning_rate": 6.412353515625e-07, "loss": 0.0028, "reward": 1.7784386277198792, "reward_std": 0.09795338660478592, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8018760681152344, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 289.390625, "epoch": 1.43603515625, "grad_norm": 5.526089374651416, "kl": 0.0697021484375, "learning_rate": 6.4111328125e-07, "loss": 0.0028, "reward": 1.8081418871879578, "reward_std": 0.05436134152114391, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.808141827583313, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 258.7734375, "epoch": 1.4365234375, "grad_norm": 1.4939649863180227, "kl": 0.0582275390625, "learning_rate": 6.409912109375e-07, "loss": 0.0023, "reward": 1.9053468704223633, "reward_std": 0.05440020468086004, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9053468704223633, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 283.109375, "epoch": 1.43701171875, "grad_norm": 1.8121729150206805, "kl": 0.074951171875, "learning_rate": 6.408691406249999e-07, "loss": 0.003, "reward": 1.762609839439392, "reward_std": 0.025970693212002516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7626098394393921, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 294.5859375, "epoch": 1.4375, "grad_norm": 0.9192342581628401, "kl": 0.0701904296875, "learning_rate": 6.407470703125e-07, "loss": 0.0028, "reward": 1.6859044432640076, "reward_std": 0.040243260096758604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6859044134616852, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 235.3671875, "epoch": 1.43798828125, "grad_norm": 2.475395126244144, "kl": 0.08544921875, "learning_rate": 6.40625e-07, "loss": 0.0034, "reward": 1.848134458065033, "reward_std": 0.0467034000903368, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.848134458065033, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 229.3828125, "epoch": 1.4384765625, "grad_norm": 3.6098224066349096, "kl": 0.075439453125, "learning_rate": 6.405029296875e-07, "loss": 0.003, "reward": 1.812850534915924, "reward_std": 0.031669266521930695, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8128505349159241, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 307.2109375, "epoch": 1.43896484375, "grad_norm": 2.222659762962882, "kl": 0.07421875, "learning_rate": 6.40380859375e-07, "loss": 0.003, "reward": 1.7910358309745789, "reward_std": 0.05065750889480114, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7910358011722565, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 326.6484375, "epoch": 1.439453125, "grad_norm": 2.208671012693913, "kl": 0.10302734375, "learning_rate": 6.402587890625e-07, "loss": 0.0041, "reward": 1.6763520240783691, "reward_std": 0.058261996135115623, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6763519942760468, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 259.3515625, "epoch": 1.43994140625, "grad_norm": 1.5352653822354414, "kl": 0.063232421875, "learning_rate": 6.401367187499999e-07, "loss": 0.0025, "reward": 1.755962073802948, "reward_std": 0.03862538933753967, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7559620141983032, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 315.7265625, "epoch": 1.4404296875, "grad_norm": 0.698355831200674, "kl": 0.06982421875, "learning_rate": 6.400146484374999e-07, "loss": 0.0028, "reward": 1.7664831280708313, "reward_std": 0.020398199558258057, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7664831578731537, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 258.8359375, "epoch": 1.44091796875, "grad_norm": 2.578363130813962, "kl": 0.080810546875, "learning_rate": 6.39892578125e-07, "loss": 0.0032, "reward": 1.7696714997291565, "reward_std": 0.051741763949394226, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7696714997291565, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 333.1875, "epoch": 1.44140625, "grad_norm": 6.744614815262741, "kl": 0.070068359375, "learning_rate": 6.397705078125e-07, "loss": 0.0028, "reward": 1.787192463874817, "reward_std": 0.030791327357292175, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7871924936771393, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 353.90625, "epoch": 1.44189453125, "grad_norm": 0.853256922168278, "kl": 0.0648193359375, "learning_rate": 6.396484375e-07, "loss": 0.0026, "reward": 1.7069947719573975, "reward_std": 0.10049226693809032, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7226196825504303, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 378.875, "epoch": 1.4423828125, "grad_norm": 1.7414803959752738, "kl": 0.0633544921875, "learning_rate": 6.395263671875e-07, "loss": 0.0025, "reward": 1.7784687280654907, "reward_std": 0.03222686983644962, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7784686982631683, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 242.796875, "epoch": 1.44287109375, "grad_norm": 0.7824543197588109, "kl": 0.065185546875, "learning_rate": 6.39404296875e-07, "loss": 0.0026, "reward": 1.835627555847168, "reward_std": 0.07133413106203079, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.835627555847168, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 259.2421875, "epoch": 1.443359375, "grad_norm": 1.005077756695292, "kl": 0.0574951171875, "learning_rate": 6.392822265624999e-07, "loss": 0.0023, "reward": 1.907008171081543, "reward_std": 0.12066750600934029, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9148207008838654, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 459.796875, "epoch": 1.44384765625, "grad_norm": 1.6630553458065824, "kl": 0.05712890625, "learning_rate": 6.391601562499999e-07, "loss": 0.0023, "reward": 1.6454344391822815, "reward_std": 0.130395095795393, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6923094987869263, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 310.1875, "epoch": 1.4443359375, "grad_norm": 1.6919739972247043, "kl": 0.080078125, "learning_rate": 6.390380859375e-07, "loss": 0.0032, "reward": 1.7030593156814575, "reward_std": 0.11804335564374924, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7343093156814575, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 299.5, "epoch": 1.44482421875, "grad_norm": 3.0488067627667186, "kl": 0.0574951171875, "learning_rate": 6.38916015625e-07, "loss": 0.0023, "reward": 1.7935433387756348, "reward_std": 0.11342027597129345, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8091683387756348, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 255.875, "epoch": 1.4453125, "grad_norm": 3.417146010971531, "kl": 0.08154296875, "learning_rate": 6.387939453125e-07, "loss": 0.0033, "reward": 1.7204725742340088, "reward_std": 0.10613266006112099, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7282850742340088, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 285.0078125, "epoch": 1.44580078125, "grad_norm": 7.369282649351467, "kl": 0.06591796875, "learning_rate": 6.38671875e-07, "loss": 0.0026, "reward": 1.7681906819343567, "reward_std": 0.02635895786806941, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7681907117366791, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 318.953125, "epoch": 1.4462890625, "grad_norm": 2.0658434347932264, "kl": 0.0645751953125, "learning_rate": 6.385498046874999e-07, "loss": 0.0026, "reward": 1.817894995212555, "reward_std": 0.07758408039808273, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8178950250148773, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 286.9765625, "epoch": 1.44677734375, "grad_norm": 0.6569603767694976, "kl": 0.0533447265625, "learning_rate": 6.384277343749999e-07, "loss": 0.0021, "reward": 1.8447397351264954, "reward_std": 0.06235711555927992, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8525522649288177, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 326.3125, "epoch": 1.447265625, "grad_norm": 2.645560581989775, "kl": 0.072265625, "learning_rate": 6.383056640625e-07, "loss": 0.0029, "reward": 1.687853217124939, "reward_std": 0.0805647261440754, "rewards/format_reward": 0.9296875, "rewards/ocr_reward": 0.758165717124939, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 380.5859375, "epoch": 1.44775390625, "grad_norm": 1.3650167038051417, "kl": 0.0523681640625, "learning_rate": 6.3818359375e-07, "loss": 0.0021, "reward": 1.7391371130943298, "reward_std": 0.15477406233549118, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7860120832920074, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 291.6328125, "epoch": 1.4482421875, "grad_norm": 1.1245251287679778, "kl": 0.072998046875, "learning_rate": 6.380615234375e-07, "loss": 0.0029, "reward": 1.646054744720459, "reward_std": 0.06316448841243982, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6538672149181366, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 334.5546875, "epoch": 1.44873046875, "grad_norm": 13.043523585623863, "kl": 0.0791015625, "learning_rate": 6.37939453125e-07, "loss": 0.0032, "reward": 1.7939326763153076, "reward_std": 0.06745261326432228, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8017451465129852, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 309.359375, "epoch": 1.44921875, "grad_norm": 1.57791062011593, "kl": 0.060302734375, "learning_rate": 6.378173828125e-07, "loss": 0.0024, "reward": 1.8393926620483398, "reward_std": 0.018147557973861694, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8393926918506622, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 316.3359375, "epoch": 1.44970703125, "grad_norm": 1.8175153858800717, "kl": 0.06005859375, "learning_rate": 6.376953124999999e-07, "loss": 0.0024, "reward": 1.7290653586387634, "reward_std": 0.12614280730485916, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7446902990341187, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 300.9296875, "epoch": 1.4501953125, "grad_norm": 9.465224240232326, "kl": 0.0694580078125, "learning_rate": 6.375732421874999e-07, "loss": 0.0028, "reward": 1.7350887060165405, "reward_std": 0.11684410274028778, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7429012060165405, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 412.9765625, "epoch": 1.45068359375, "grad_norm": 2.083144051979017, "kl": 0.061279296875, "learning_rate": 6.37451171875e-07, "loss": 0.0024, "reward": 1.7892062067985535, "reward_std": 0.12466869875788689, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8048312067985535, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 374.1796875, "epoch": 1.451171875, "grad_norm": 3.731969056408829, "kl": 0.06591796875, "learning_rate": 6.373291015625e-07, "loss": 0.0026, "reward": 1.8324419260025024, "reward_std": 0.07397226989269257, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8324419260025024, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 328.8515625, "epoch": 1.45166015625, "grad_norm": 2.1941261882702006, "kl": 0.0714111328125, "learning_rate": 6.3720703125e-07, "loss": 0.0029, "reward": 1.7780184149742126, "reward_std": 0.20118620991706848, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8014559149742126, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 247.21875, "epoch": 1.4521484375, "grad_norm": 1.3453507107528788, "kl": 0.070556640625, "learning_rate": 6.370849609375e-07, "loss": 0.0028, "reward": 1.6615075469017029, "reward_std": 0.01416647876612842, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6615075469017029, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 301.203125, "epoch": 1.45263671875, "grad_norm": 0.941837624396483, "kl": 0.071533203125, "learning_rate": 6.369628906249999e-07, "loss": 0.0029, "reward": 1.7576044797897339, "reward_std": 0.030522312968969345, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7576044797897339, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 289.1953125, "epoch": 1.453125, "grad_norm": 2.6111080609725876, "kl": 0.063720703125, "learning_rate": 6.368408203124999e-07, "loss": 0.0025, "reward": 1.748826265335083, "reward_std": 0.040242042392492294, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.748826265335083, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 263.75, "epoch": 1.45361328125, "grad_norm": 9.758378083342457, "kl": 0.08203125, "learning_rate": 6.3671875e-07, "loss": 0.0033, "reward": 1.7325801849365234, "reward_std": 0.1269008917734027, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7560176849365234, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 384.34375, "epoch": 1.4541015625, "grad_norm": 1.4790068606433127, "kl": 0.05712890625, "learning_rate": 6.365966796875e-07, "loss": 0.0023, "reward": 1.7784000039100647, "reward_std": 0.11495145037770271, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7862125039100647, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 323.5234375, "epoch": 1.45458984375, "grad_norm": 7.445604414954171, "kl": 0.090087890625, "learning_rate": 6.36474609375e-07, "loss": 0.0036, "reward": 1.7594309449195862, "reward_std": 0.05727781727910042, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7594309449195862, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 308.8984375, "epoch": 1.455078125, "grad_norm": 1.9969872662674215, "kl": 0.0635986328125, "learning_rate": 6.363525390625e-07, "loss": 0.0025, "reward": 1.8546399474143982, "reward_std": 0.03894917480647564, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8546398878097534, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 295.3984375, "epoch": 1.45556640625, "grad_norm": 2.774830461991803, "kl": 0.086669921875, "learning_rate": 6.3623046875e-07, "loss": 0.0035, "reward": 1.7985565066337585, "reward_std": 0.09453297778964043, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8063690066337585, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 282.734375, "epoch": 1.4560546875, "grad_norm": 1.005235350260796, "kl": 0.067626953125, "learning_rate": 6.361083984374999e-07, "loss": 0.0027, "reward": 1.8525811433792114, "reward_std": 0.032127720303833485, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.852581113576889, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 380.15625, "epoch": 1.45654296875, "grad_norm": 1.480660541121175, "kl": 0.06591796875, "learning_rate": 6.359863281249999e-07, "loss": 0.0026, "reward": 1.8015734553337097, "reward_std": 0.10697927977889776, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8093858957290649, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 266.6484375, "epoch": 1.45703125, "grad_norm": 1.28090728544506, "kl": 0.090087890625, "learning_rate": 6.358642578125e-07, "loss": 0.0036, "reward": 1.7251802682876587, "reward_std": 0.02976925577968359, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7251803278923035, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 358.25, "epoch": 1.45751953125, "grad_norm": 1.5160356660704382, "kl": 0.0509033203125, "learning_rate": 6.357421875e-07, "loss": 0.002, "reward": 1.7424423694610596, "reward_std": 0.10370543040335178, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7502549290657043, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 365.1875, "epoch": 1.4580078125, "grad_norm": 1.2745109824108936, "kl": 0.06298828125, "learning_rate": 6.356201171875e-07, "loss": 0.0025, "reward": 1.5927727818489075, "reward_std": 0.12405483797192574, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6318353116512299, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 364.640625, "epoch": 1.45849609375, "grad_norm": 2.169138878887581, "kl": 0.0712890625, "learning_rate": 6.35498046875e-07, "loss": 0.0029, "reward": 1.860952913761139, "reward_std": 0.06300730584189296, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8609528839588165, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 322.421875, "epoch": 1.458984375, "grad_norm": 1.6032586983368817, "kl": 0.074462890625, "learning_rate": 6.353759765624999e-07, "loss": 0.003, "reward": 1.6849730610847473, "reward_std": 0.04362546745687723, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6849730312824249, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 174.3125, "epoch": 1.45947265625, "grad_norm": 2.296346852512009, "kl": 0.0687255859375, "learning_rate": 6.352539062499999e-07, "loss": 0.0028, "reward": 1.8462890982627869, "reward_std": 0.11591282114386559, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8541015088558197, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 331.3125, "epoch": 1.4599609375, "grad_norm": 4.792249158952244, "kl": 0.067626953125, "learning_rate": 6.351318359375e-07, "loss": 0.0027, "reward": 1.6084554195404053, "reward_std": 0.08818965405225754, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6162679195404053, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 321.078125, "epoch": 1.46044921875, "grad_norm": 0.4828418747479682, "kl": 0.05810546875, "learning_rate": 6.35009765625e-07, "loss": 0.0023, "reward": 1.6002402305603027, "reward_std": 0.12684646097477525, "rewards/format_reward": 0.9375, "rewards/ocr_reward": 0.6627402305603027, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 387.5078125, "epoch": 1.4609375, "grad_norm": 1.515503324615265, "kl": 0.0631103515625, "learning_rate": 6.348876953125e-07, "loss": 0.0025, "reward": 1.8527971506118774, "reward_std": 0.07764232903718948, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.852797120809555, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 383.03125, "epoch": 1.46142578125, "grad_norm": 1.2235439862774495, "kl": 0.0574951171875, "learning_rate": 6.34765625e-07, "loss": 0.0023, "reward": 1.7806990146636963, "reward_std": 0.05695920065045357, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7885115146636963, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 364.25, "epoch": 1.4619140625, "grad_norm": 3.314536987656733, "kl": 0.0587158203125, "learning_rate": 6.346435546875e-07, "loss": 0.0023, "reward": 1.8154310584068298, "reward_std": 0.12618440762162209, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8310559988021851, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 289.0390625, "epoch": 1.46240234375, "grad_norm": 1.2908463477638215, "kl": 0.0565185546875, "learning_rate": 6.345214843749999e-07, "loss": 0.0023, "reward": 1.7146747708320618, "reward_std": 0.07501043565571308, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7224873006343842, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 393.8984375, "epoch": 1.462890625, "grad_norm": 2.9125890422786016, "kl": 0.072021484375, "learning_rate": 6.343994140624999e-07, "loss": 0.0029, "reward": 1.6301099061965942, "reward_std": 0.13861995935440063, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6535473763942719, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 259.2734375, "epoch": 1.46337890625, "grad_norm": 0.8886604709538318, "kl": 0.05078125, "learning_rate": 6.3427734375e-07, "loss": 0.002, "reward": 1.7734524011611938, "reward_std": 0.020457894541323185, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7734524309635162, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 301.0546875, "epoch": 1.4638671875, "grad_norm": 1.0464030461396734, "kl": 0.0601806640625, "learning_rate": 6.341552734375e-07, "loss": 0.0024, "reward": 1.654776692390442, "reward_std": 0.08548066765069962, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6704016923904419, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 288.515625, "epoch": 1.46435546875, "grad_norm": 9.606833597314674, "kl": 0.059326171875, "learning_rate": 6.34033203125e-07, "loss": 0.0024, "reward": 1.8002928495407104, "reward_std": 0.06611186265945435, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8002929091453552, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 342.0546875, "epoch": 1.46484375, "grad_norm": 1.0930533697553297, "kl": 0.056640625, "learning_rate": 6.339111328125e-07, "loss": 0.0023, "reward": 1.7426326274871826, "reward_std": 0.0601738141849637, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.742632657289505, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 326.921875, "epoch": 1.46533203125, "grad_norm": 0.9165980219862101, "kl": 0.0672607421875, "learning_rate": 6.337890625e-07, "loss": 0.0027, "reward": 1.829143762588501, "reward_std": 0.051721951458603144, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.836956262588501, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 281.5625, "epoch": 1.4658203125, "grad_norm": 3.771010388411293, "kl": 0.09716796875, "learning_rate": 6.336669921874999e-07, "loss": 0.0039, "reward": 1.746791124343872, "reward_std": 0.0833306573331356, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7546035051345825, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 307.3515625, "epoch": 1.46630859375, "grad_norm": 1.7568136419799434, "kl": 0.079345703125, "learning_rate": 6.33544921875e-07, "loss": 0.0032, "reward": 1.8903692960739136, "reward_std": 0.08128884993493557, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8981817662715912, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 237.3359375, "epoch": 1.466796875, "grad_norm": 0.6670166215082387, "kl": 0.0595703125, "learning_rate": 6.334228515625e-07, "loss": 0.0024, "reward": 1.7217431664466858, "reward_std": 0.052741317078471184, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7217431664466858, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 341.8359375, "epoch": 1.46728515625, "grad_norm": 0.5130515259710517, "kl": 0.0673828125, "learning_rate": 6.3330078125e-07, "loss": 0.0027, "reward": 1.747616171836853, "reward_std": 0.09704152680933475, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7632412910461426, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 308.2578125, "epoch": 1.4677734375, "grad_norm": 1.4447424507562618, "kl": 0.0552978515625, "learning_rate": 6.331787109375e-07, "loss": 0.0022, "reward": 1.767389953136444, "reward_std": 0.052734846249222755, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7673899531364441, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 344.390625, "epoch": 1.46826171875, "grad_norm": 2.541777603017261, "kl": 0.07666015625, "learning_rate": 6.33056640625e-07, "loss": 0.0031, "reward": 1.813210904598236, "reward_std": 0.11053607612848282, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8210234045982361, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 370.0, "epoch": 1.46875, "grad_norm": 0.8804910586643099, "kl": 0.0538330078125, "learning_rate": 6.329345703124999e-07, "loss": 0.0022, "reward": 1.8383709192276, "reward_std": 0.02365578804165125, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8383709490299225, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 331.1484375, "epoch": 1.46923828125, "grad_norm": 2.8323385895385247, "kl": 0.066162109375, "learning_rate": 6.328124999999999e-07, "loss": 0.0026, "reward": 1.7967005968093872, "reward_std": 0.07634428888559341, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8045131862163544, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 244.4921875, "epoch": 1.4697265625, "grad_norm": 1.558692224561045, "kl": 0.09521484375, "learning_rate": 6.326904296875e-07, "loss": 0.0038, "reward": 1.6224290132522583, "reward_std": 0.04988163709640503, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6224290430545807, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 352.375, "epoch": 1.47021484375, "grad_norm": 0.6934292087860076, "kl": 0.0556640625, "learning_rate": 6.32568359375e-07, "loss": 0.0022, "reward": 1.9525578022003174, "reward_std": 0.031788173131644726, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9525578618049622, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 373.7109375, "epoch": 1.470703125, "grad_norm": 1.1566353452168543, "kl": 0.0509033203125, "learning_rate": 6.324462890625e-07, "loss": 0.002, "reward": 1.80779629945755, "reward_std": 0.13630902767181396, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.83123379945755, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 191.28125, "epoch": 1.47119140625, "grad_norm": 1.4657246132330377, "kl": 0.0772705078125, "learning_rate": 6.3232421875e-07, "loss": 0.0031, "reward": 1.868379831314087, "reward_std": 0.05622401461005211, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8683798313140869, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 325.09375, "epoch": 1.4716796875, "grad_norm": 0.8995301250346504, "kl": 0.0518798828125, "learning_rate": 6.322021484375e-07, "loss": 0.0021, "reward": 1.7831536531448364, "reward_std": 0.03840099833905697, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7831536531448364, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 304.7578125, "epoch": 1.47216796875, "grad_norm": 1.064413517900183, "kl": 0.0673828125, "learning_rate": 6.320800781249999e-07, "loss": 0.0027, "reward": 1.8467872142791748, "reward_std": 0.06419426389038563, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8545996248722076, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 288.8203125, "epoch": 1.47265625, "grad_norm": 1.2422987199959086, "kl": 0.0726318359375, "learning_rate": 6.319580078125e-07, "loss": 0.0029, "reward": 1.8063626289367676, "reward_std": 0.14462891966104507, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8141750693321228, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 288.953125, "epoch": 1.47314453125, "grad_norm": 2.517009818019777, "kl": 0.09033203125, "learning_rate": 6.318359375e-07, "loss": 0.0036, "reward": 1.653084933757782, "reward_std": 0.098308514803648, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6530849635601044, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 265.8203125, "epoch": 1.4736328125, "grad_norm": 2.340141995970886, "kl": 0.0723876953125, "learning_rate": 6.317138671875e-07, "loss": 0.0029, "reward": 1.7496492862701416, "reward_std": 0.1289630625396967, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7496492564678192, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 400.96875, "epoch": 1.47412109375, "grad_norm": 1.256597280828511, "kl": 0.054443359375, "learning_rate": 6.31591796875e-07, "loss": 0.0022, "reward": 1.8092172145843506, "reward_std": 0.0746869370341301, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8170297741889954, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 249.609375, "epoch": 1.474609375, "grad_norm": 2.4683338443131673, "kl": 0.074951171875, "learning_rate": 6.314697265625e-07, "loss": 0.003, "reward": 1.6483544707298279, "reward_std": 0.10350741818547249, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6561670005321503, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 278.5390625, "epoch": 1.47509765625, "grad_norm": 3.6247431205126874, "kl": 0.0791015625, "learning_rate": 6.313476562499999e-07, "loss": 0.0032, "reward": 1.7551026344299316, "reward_std": 0.11083749681711197, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.755102664232254, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 309.40625, "epoch": 1.4755859375, "grad_norm": 1.0766199180438065, "kl": 0.07763671875, "learning_rate": 6.312255859374999e-07, "loss": 0.0031, "reward": 1.7589967250823975, "reward_std": 0.05489533022046089, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7589967548847198, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 330.65625, "epoch": 1.47607421875, "grad_norm": 1.4208095059281178, "kl": 0.0693359375, "learning_rate": 6.31103515625e-07, "loss": 0.0028, "reward": 1.8012661933898926, "reward_std": 0.045568812638521194, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012661635875702, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 259.28125, "epoch": 1.4765625, "grad_norm": 3.075879562177541, "kl": 0.0670166015625, "learning_rate": 6.309814453125e-07, "loss": 0.0027, "reward": 1.870323121547699, "reward_std": 0.03863493725657463, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.870323121547699, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 294.3671875, "epoch": 1.47705078125, "grad_norm": 6.548224072360138, "kl": 0.064208984375, "learning_rate": 6.30859375e-07, "loss": 0.0026, "reward": 1.7646169662475586, "reward_std": 0.07329913601279259, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7646169662475586, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 308.3203125, "epoch": 1.4775390625, "grad_norm": 3.4394236221792247, "kl": 0.081787109375, "learning_rate": 6.307373046875e-07, "loss": 0.0033, "reward": 1.6831781268119812, "reward_std": 0.07777292281389236, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6909905970096588, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 293.390625, "epoch": 1.47802734375, "grad_norm": 1.4825730777624018, "kl": 0.068359375, "learning_rate": 6.30615234375e-07, "loss": 0.0027, "reward": 1.7502532005310059, "reward_std": 0.055065859109163284, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7502532303333282, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 264.078125, "epoch": 1.478515625, "grad_norm": 1.5490192480578866, "kl": 0.07470703125, "learning_rate": 6.304931640624999e-07, "loss": 0.003, "reward": 1.7950791120529175, "reward_std": 0.038502528332173824, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7950790822505951, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 222.9375, "epoch": 1.47900390625, "grad_norm": 7.667208022460056, "kl": 0.07861328125, "learning_rate": 6.3037109375e-07, "loss": 0.0031, "reward": 1.8664205074310303, "reward_std": 0.07143169827759266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8664205372333527, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 303.2421875, "epoch": 1.4794921875, "grad_norm": 1.562849631053882, "kl": 0.096923828125, "learning_rate": 6.302490234375e-07, "loss": 0.0039, "reward": 1.6895395517349243, "reward_std": 0.03657793905586004, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6895396113395691, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 336.4609375, "epoch": 1.47998046875, "grad_norm": 2.072472095377072, "kl": 0.073974609375, "learning_rate": 6.30126953125e-07, "loss": 0.003, "reward": 1.6448410749435425, "reward_std": 0.12514834105968475, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6526535749435425, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 355.90625, "epoch": 1.48046875, "grad_norm": 4.248343394614419, "kl": 0.082763671875, "learning_rate": 6.300048828125e-07, "loss": 0.0033, "reward": 1.676950991153717, "reward_std": 0.05259130522608757, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.676950991153717, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 246.3515625, "epoch": 1.48095703125, "grad_norm": 10.016521585655443, "kl": 0.0672607421875, "learning_rate": 6.298828125e-07, "loss": 0.0027, "reward": 1.855322241783142, "reward_std": 0.06691266316920519, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8553222417831421, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 211.3125, "epoch": 1.4814453125, "grad_norm": 1.7764278531475133, "kl": 0.056884765625, "learning_rate": 6.297607421874999e-07, "loss": 0.0023, "reward": 1.932866632938385, "reward_std": 0.05053331330418587, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.932866632938385, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 418.2890625, "epoch": 1.48193359375, "grad_norm": 2.0240030324030367, "kl": 0.0572509765625, "learning_rate": 6.296386718749999e-07, "loss": 0.0023, "reward": 1.6182212233543396, "reward_std": 0.2208278402686119, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.67290860414505, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 332.5859375, "epoch": 1.482421875, "grad_norm": 1.713898440746238, "kl": 0.0733642578125, "learning_rate": 6.295166015625e-07, "loss": 0.0029, "reward": 1.7418071627616882, "reward_std": 0.10038780607283115, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7496196925640106, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 285.8125, "epoch": 1.48291015625, "grad_norm": 0.9264185253043364, "kl": 0.075439453125, "learning_rate": 6.2939453125e-07, "loss": 0.003, "reward": 1.741519808769226, "reward_std": 0.07616345398128033, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7493323385715485, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 305.7890625, "epoch": 1.4833984375, "grad_norm": 1.4044617030039843, "kl": 0.095947265625, "learning_rate": 6.292724609375e-07, "loss": 0.0038, "reward": 1.6602322459220886, "reward_std": 0.1420225277543068, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6758573055267334, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 368.3203125, "epoch": 1.48388671875, "grad_norm": 1.9738335423607676, "kl": 0.071533203125, "learning_rate": 6.29150390625e-07, "loss": 0.0029, "reward": 1.7275782823562622, "reward_std": 0.08198518864810467, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7432032227516174, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 305.6015625, "epoch": 1.484375, "grad_norm": 2.0017739219506825, "kl": 0.080322265625, "learning_rate": 6.290283203125e-07, "loss": 0.0032, "reward": 1.644084870815277, "reward_std": 0.0899181142449379, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6518973708152771, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 256.2421875, "epoch": 1.48486328125, "grad_norm": 0.883112354454319, "kl": 0.084228515625, "learning_rate": 6.289062499999999e-07, "loss": 0.0034, "reward": 1.513433575630188, "reward_std": 0.05119518283754587, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.5134336054325104, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 313.3359375, "epoch": 1.4853515625, "grad_norm": 1.3435499384819551, "kl": 0.0771484375, "learning_rate": 6.287841796875e-07, "loss": 0.0031, "reward": 1.7283309698104858, "reward_std": 0.10164744779467583, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7283309698104858, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 288.0625, "epoch": 1.48583984375, "grad_norm": 1.5022112167151482, "kl": 0.0653076171875, "learning_rate": 6.28662109375e-07, "loss": 0.0026, "reward": 1.7406939268112183, "reward_std": 0.07021256536245346, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7406939268112183, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 305.3359375, "epoch": 1.486328125, "grad_norm": 2.0786694119288205, "kl": 0.083984375, "learning_rate": 6.285400390625e-07, "loss": 0.0034, "reward": 1.7357019186019897, "reward_std": 0.0432198503986001, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7357019484043121, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 290.1796875, "epoch": 1.48681640625, "grad_norm": 1.4782807841049792, "kl": 0.09716796875, "learning_rate": 6.2841796875e-07, "loss": 0.0039, "reward": 1.7358573079109192, "reward_std": 0.10283184796571732, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7514822483062744, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 309.5390625, "epoch": 1.4873046875, "grad_norm": 9.234695754183694, "kl": 0.10302734375, "learning_rate": 6.282958984375e-07, "loss": 0.0041, "reward": 1.6147398948669434, "reward_std": 0.14877690002322197, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.6381773948669434, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 238.9609375, "epoch": 1.48779296875, "grad_norm": 1.043601797533081, "kl": 0.078369140625, "learning_rate": 6.281738281249999e-07, "loss": 0.0031, "reward": 1.7752271890640259, "reward_std": 0.06411982700228691, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7752272188663483, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 237.59375, "epoch": 1.48828125, "grad_norm": 2.505802779158192, "kl": 0.06298828125, "learning_rate": 6.280517578124999e-07, "loss": 0.0025, "reward": 1.8750739693641663, "reward_std": 0.018464698921889067, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8750739097595215, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 259.875, "epoch": 1.48876953125, "grad_norm": 1.5420008942864827, "kl": 0.088134765625, "learning_rate": 6.279296875e-07, "loss": 0.0035, "reward": 1.8027490973472595, "reward_std": 0.07335010170936584, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8105616271495819, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 289.0078125, "epoch": 1.4892578125, "grad_norm": 1.883570258751632, "kl": 0.08447265625, "learning_rate": 6.278076171875e-07, "loss": 0.0034, "reward": 1.8453101515769958, "reward_std": 0.052738748490810394, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8453101217746735, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 312.65625, "epoch": 1.48974609375, "grad_norm": 1.9780475385606873, "kl": 0.091064453125, "learning_rate": 6.27685546875e-07, "loss": 0.0036, "reward": 1.8935607075691223, "reward_std": 0.05056627467274666, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8935607373714447, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 270.734375, "epoch": 1.490234375, "grad_norm": 2.9996617070545546, "kl": 0.0687255859375, "learning_rate": 6.275634765625e-07, "loss": 0.0027, "reward": 1.8360978960990906, "reward_std": 0.054630378261208534, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.836097925901413, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 302.3515625, "epoch": 1.49072265625, "grad_norm": 2.6323826664386236, "kl": 0.079833984375, "learning_rate": 6.2744140625e-07, "loss": 0.0032, "reward": 1.749899685382843, "reward_std": 0.034031180664896965, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7498997449874878, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 333.5625, "epoch": 1.4912109375, "grad_norm": 1.2654812608116905, "kl": 0.0703125, "learning_rate": 6.273193359374999e-07, "loss": 0.0028, "reward": 1.781448781490326, "reward_std": 0.03715716116130352, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7814488708972931, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 228.8203125, "epoch": 1.49169921875, "grad_norm": 4.420456353697728, "kl": 0.07373046875, "learning_rate": 6.271972656249999e-07, "loss": 0.0029, "reward": 1.8104448914527893, "reward_std": 0.09506340697407722, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8104448914527893, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 359.421875, "epoch": 1.4921875, "grad_norm": 1.6948290597573188, "kl": 0.069091796875, "learning_rate": 6.270751953125e-07, "loss": 0.0028, "reward": 1.7458880543708801, "reward_std": 0.08343839459121227, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7537006139755249, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 305.2734375, "epoch": 1.49267578125, "grad_norm": 6.382155809604849, "kl": 0.069091796875, "learning_rate": 6.26953125e-07, "loss": 0.0028, "reward": 1.8328390717506409, "reward_std": 0.06546132825314999, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8406516313552856, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 225.0, "epoch": 1.4931640625, "grad_norm": 1.482140538189339, "kl": 0.080078125, "learning_rate": 6.268310546875e-07, "loss": 0.0032, "reward": 1.8376395106315613, "reward_std": 0.046169581823050976, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8376395106315613, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 302.1171875, "epoch": 1.49365234375, "grad_norm": 1.7613816221555076, "kl": 0.0782470703125, "learning_rate": 6.26708984375e-07, "loss": 0.0031, "reward": 1.6370373368263245, "reward_std": 0.046255904249846935, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6370373964309692, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 292.8046875, "epoch": 1.494140625, "grad_norm": 2.290078497473891, "kl": 0.065185546875, "learning_rate": 6.265869140624999e-07, "loss": 0.0026, "reward": 1.7369165420532227, "reward_std": 0.14749253168702126, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7369165420532227, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 279.765625, "epoch": 1.49462890625, "grad_norm": 0.5100306804078655, "kl": 0.0810546875, "learning_rate": 6.264648437499999e-07, "loss": 0.0032, "reward": 1.8437798023223877, "reward_std": 0.03653890639543533, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8437797725200653, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 233.2109375, "epoch": 1.4951171875, "grad_norm": 1.6593962077313371, "kl": 0.08984375, "learning_rate": 6.263427734375e-07, "loss": 0.0036, "reward": 1.6682219505310059, "reward_std": 0.019469616003334522, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6682219803333282, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 265.84375, "epoch": 1.49560546875, "grad_norm": 1.8396284831116319, "kl": 0.0830078125, "learning_rate": 6.26220703125e-07, "loss": 0.0033, "reward": 1.727663278579712, "reward_std": 0.06270462274551392, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7276632189750671, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 284.6171875, "epoch": 1.49609375, "grad_norm": 1.0117046188705436, "kl": 0.0732421875, "learning_rate": 6.260986328125e-07, "loss": 0.0029, "reward": 1.7094378471374512, "reward_std": 0.02379227802157402, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7094378173351288, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 290.8828125, "epoch": 1.49658203125, "grad_norm": 1.3497291028510259, "kl": 0.090576171875, "learning_rate": 6.259765625e-07, "loss": 0.0036, "reward": 1.7751423716545105, "reward_std": 0.13514219038188457, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7907673716545105, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 338.46875, "epoch": 1.4970703125, "grad_norm": 3.4190881996802682, "kl": 0.0592041015625, "learning_rate": 6.258544921875e-07, "loss": 0.0024, "reward": 1.764043927192688, "reward_std": 0.07390506565570831, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.771856427192688, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 301.140625, "epoch": 1.49755859375, "grad_norm": 2.0795235433392705, "kl": 0.0665283203125, "learning_rate": 6.257324218749999e-07, "loss": 0.0027, "reward": 1.8003657460212708, "reward_std": 0.05936916545033455, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8003657460212708, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 278.78125, "epoch": 1.498046875, "grad_norm": 1.2096859898867558, "kl": 0.0479736328125, "learning_rate": 6.256103515624999e-07, "loss": 0.0019, "reward": 1.7172734141349792, "reward_std": 0.028629466891288757, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7172734439373016, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 208.25, "epoch": 1.49853515625, "grad_norm": 0.837342886906685, "kl": 0.0615234375, "learning_rate": 6.2548828125e-07, "loss": 0.0025, "reward": 1.7248526215553284, "reward_std": 0.04399119131267071, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7248526215553284, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 313.0234375, "epoch": 1.4990234375, "grad_norm": 0.8384243546554345, "kl": 0.0643310546875, "learning_rate": 6.253662109375e-07, "loss": 0.0026, "reward": 1.7257351875305176, "reward_std": 0.09183939173817635, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7413601875305176, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 235.1875, "epoch": 1.49951171875, "grad_norm": 1.1447058461259672, "kl": 0.074951171875, "learning_rate": 6.25244140625e-07, "loss": 0.003, "reward": 1.81594717502594, "reward_std": 0.021475983783602715, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8159471154212952, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 316.921875, "epoch": 1.5, "grad_norm": 1.7077930126684038, "kl": 0.07080078125, "learning_rate": 6.251220703125e-07, "loss": 0.0028, "reward": 1.6829584240913391, "reward_std": 0.1759318709373474, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.7376458644866943, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 222.5859375, "epoch": 1.50048828125, "grad_norm": 4.251619392395568, "kl": 0.079345703125, "learning_rate": 6.249999999999999e-07, "loss": 0.0032, "reward": 1.7641262412071228, "reward_std": 0.10015225410461426, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7797512412071228, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 293.0546875, "epoch": 1.5009765625, "grad_norm": 2.037037846840877, "kl": 0.0662841796875, "learning_rate": 6.248779296874999e-07, "loss": 0.0027, "reward": 1.8416993618011475, "reward_std": 0.0264980373904109, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8416993618011475, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 214.3125, "epoch": 1.50146484375, "grad_norm": 2.4016129108930446, "kl": 0.07373046875, "learning_rate": 6.24755859375e-07, "loss": 0.003, "reward": 1.884174108505249, "reward_std": 0.01571572571992874, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8841741383075714, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 393.6953125, "epoch": 1.501953125, "grad_norm": 14.731726849066805, "kl": 0.0576171875, "learning_rate": 6.246337890625e-07, "loss": 0.0023, "reward": 1.8040361404418945, "reward_std": 0.1080729328095913, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8274736106395721, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 280.3984375, "epoch": 1.50244140625, "grad_norm": 2.9540282421835924, "kl": 0.0684814453125, "learning_rate": 6.2451171875e-07, "loss": 0.0027, "reward": 1.8184278011322021, "reward_std": 0.05250486359000206, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8184277713298798, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 358.6875, "epoch": 1.5029296875, "grad_norm": 1.8338223896953099, "kl": 0.0611572265625, "learning_rate": 6.243896484375e-07, "loss": 0.0024, "reward": 1.7305577397346497, "reward_std": 0.09738858230412006, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7618077993392944, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 305.2421875, "epoch": 1.50341796875, "grad_norm": 1.4824856231273853, "kl": 0.07666015625, "learning_rate": 6.24267578125e-07, "loss": 0.0031, "reward": 1.8530486822128296, "reward_std": 0.06430929712951183, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8530486822128296, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 314.5859375, "epoch": 1.50390625, "grad_norm": 1.2200578050818203, "kl": 0.06494140625, "learning_rate": 6.241455078124999e-07, "loss": 0.0026, "reward": 1.869605302810669, "reward_std": 0.053065571933984756, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8696053326129913, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 281.59375, "epoch": 1.50439453125, "grad_norm": 2.107871829584508, "kl": 0.086669921875, "learning_rate": 6.240234374999999e-07, "loss": 0.0035, "reward": 1.7725425362586975, "reward_std": 0.08174478355795145, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7725425660610199, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 305.765625, "epoch": 1.5048828125, "grad_norm": 2.090164106693488, "kl": 0.0560302734375, "learning_rate": 6.239013671875e-07, "loss": 0.0022, "reward": 1.677848756313324, "reward_std": 0.06150129809975624, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6778488159179688, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 212.78125, "epoch": 1.50537109375, "grad_norm": 1.9087461795639435, "kl": 0.08447265625, "learning_rate": 6.23779296875e-07, "loss": 0.0034, "reward": 1.8541353940963745, "reward_std": 0.09079751744866371, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8541353642940521, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 296.2890625, "epoch": 1.505859375, "grad_norm": 1.3215498934075522, "kl": 0.066162109375, "learning_rate": 6.236572265625e-07, "loss": 0.0026, "reward": 1.838699460029602, "reward_std": 0.1073538176715374, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8543243706226349, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 286.96875, "epoch": 1.50634765625, "grad_norm": 1.6207069275912624, "kl": 0.059814453125, "learning_rate": 6.2353515625e-07, "loss": 0.0024, "reward": 1.8489339351654053, "reward_std": 0.08780923672020435, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.8958089351654053, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 264.8359375, "epoch": 1.5068359375, "grad_norm": 1.8268450463781, "kl": 0.0859375, "learning_rate": 6.234130859374999e-07, "loss": 0.0034, "reward": 1.855950951576233, "reward_std": 0.04823304433375597, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8559509217739105, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 258.71875, "epoch": 1.50732421875, "grad_norm": 0.8367633020259986, "kl": 0.09033203125, "learning_rate": 6.232910156249999e-07, "loss": 0.0036, "reward": 1.7904430627822876, "reward_std": 0.04001910053193569, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7904431223869324, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 291.59375, "epoch": 1.5078125, "grad_norm": 2.52179471226621, "kl": 0.095703125, "learning_rate": 6.231689453125e-07, "loss": 0.0038, "reward": 2.024749219417572, "reward_std": 0.08385680988430977, "rewards/format_reward": 1.0, "rewards/ocr_reward": 1.0247493088245392, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 277.1171875, "epoch": 1.50830078125, "grad_norm": 1.0623002349275934, "kl": 0.074951171875, "learning_rate": 6.23046875e-07, "loss": 0.003, "reward": 1.8043740391731262, "reward_std": 0.04912651889026165, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8043740093708038, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 340.015625, "epoch": 1.5087890625, "grad_norm": 2.3861784634827097, "kl": 0.069580078125, "learning_rate": 6.229248046875e-07, "loss": 0.0028, "reward": 1.7216225266456604, "reward_std": 0.05122903361916542, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.721622496843338, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 266.96875, "epoch": 1.50927734375, "grad_norm": 9.04191328799389, "kl": 0.0845947265625, "learning_rate": 6.22802734375e-07, "loss": 0.0034, "reward": 1.7171977162361145, "reward_std": 0.03181068133562803, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7171976864337921, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 292.4609375, "epoch": 1.509765625, "grad_norm": 1.5795526400740247, "kl": 0.075439453125, "learning_rate": 6.226806640625e-07, "loss": 0.003, "reward": 1.6946417689323425, "reward_std": 0.08766061812639236, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7180792689323425, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 288.0234375, "epoch": 1.51025390625, "grad_norm": 1.7844017464871738, "kl": 0.0859375, "learning_rate": 6.225585937499999e-07, "loss": 0.0034, "reward": 1.8546817898750305, "reward_std": 0.05815475434064865, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8546817898750305, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 233.109375, "epoch": 1.5107421875, "grad_norm": 0.6275503497060736, "kl": 0.07763671875, "learning_rate": 6.224365234374999e-07, "loss": 0.0031, "reward": 1.9210276007652283, "reward_std": 0.014174860902130604, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9210276007652283, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 281.0234375, "epoch": 1.51123046875, "grad_norm": 2.8960257527388484, "kl": 0.07421875, "learning_rate": 6.22314453125e-07, "loss": 0.003, "reward": 1.6736098527908325, "reward_std": 0.07500293478369713, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6736098527908325, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 168.0625, "epoch": 1.51171875, "grad_norm": 3.3937763447140306, "kl": 0.106201171875, "learning_rate": 6.221923828125e-07, "loss": 0.0043, "reward": 1.6761849522590637, "reward_std": 0.15080446749925613, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6839974224567413, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 271.1015625, "epoch": 1.51220703125, "grad_norm": 0.7920607825397445, "kl": 0.0640869140625, "learning_rate": 6.220703125e-07, "loss": 0.0026, "reward": 1.7301459312438965, "reward_std": 0.030234874226152897, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7301459610462189, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 300.2890625, "epoch": 1.5126953125, "grad_norm": 1.725648229722019, "kl": 0.0693359375, "learning_rate": 6.219482421875e-07, "loss": 0.0028, "reward": 1.7913671731948853, "reward_std": 0.04710565786808729, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7913671433925629, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 328.0625, "epoch": 1.51318359375, "grad_norm": 5.525808172297369, "kl": 0.062744140625, "learning_rate": 6.21826171875e-07, "loss": 0.0025, "reward": 1.8240219950675964, "reward_std": 0.05396724492311478, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.824021965265274, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 286.078125, "epoch": 1.513671875, "grad_norm": 1.0932630221981614, "kl": 0.0712890625, "learning_rate": 6.217041015624999e-07, "loss": 0.0029, "reward": 1.882387936115265, "reward_std": 0.03053974825888872, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8823879063129425, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 308.8828125, "epoch": 1.51416015625, "grad_norm": 0.8220993732465702, "kl": 0.081298828125, "learning_rate": 6.2158203125e-07, "loss": 0.0032, "reward": 1.723827600479126, "reward_std": 0.02734041726216674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.723827600479126, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 285.734375, "epoch": 1.5146484375, "grad_norm": 0.7732507508307016, "kl": 0.06884765625, "learning_rate": 6.214599609375e-07, "loss": 0.0028, "reward": 1.68502938747406, "reward_std": 0.06600722670555115, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6850293874740601, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 276.421875, "epoch": 1.51513671875, "grad_norm": 1.8060542775559811, "kl": 0.0791015625, "learning_rate": 6.21337890625e-07, "loss": 0.0032, "reward": 1.725981593132019, "reward_std": 0.05310596153140068, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.725981593132019, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 266.4375, "epoch": 1.515625, "grad_norm": 2.287645696309139, "kl": 0.07177734375, "learning_rate": 6.212158203125e-07, "loss": 0.0029, "reward": 1.8063457012176514, "reward_std": 0.060521697625517845, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8063457310199738, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 261.0703125, "epoch": 1.51611328125, "grad_norm": 1.7967027573346939, "kl": 0.0618896484375, "learning_rate": 6.2109375e-07, "loss": 0.0025, "reward": 1.766247570514679, "reward_std": 0.027749599888920784, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.766247570514679, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 308.84375, "epoch": 1.5166015625, "grad_norm": 0.4646725109012026, "kl": 0.078125, "learning_rate": 6.209716796874999e-07, "loss": 0.0031, "reward": 1.7971450686454773, "reward_std": 0.024683097377419472, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7971450686454773, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 228.4453125, "epoch": 1.51708984375, "grad_norm": 1.8283999063737095, "kl": 0.0662841796875, "learning_rate": 6.208496093749999e-07, "loss": 0.0027, "reward": 1.9062353372573853, "reward_std": 0.07536712661385536, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9062353372573853, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 262.1171875, "epoch": 1.517578125, "grad_norm": 17.037657891691474, "kl": 0.09130859375, "learning_rate": 6.207275390625e-07, "loss": 0.0037, "reward": 1.7185717821121216, "reward_std": 0.06076034903526306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7185717523097992, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 331.8046875, "epoch": 1.51806640625, "grad_norm": 2.418724869062214, "kl": 0.079833984375, "learning_rate": 6.2060546875e-07, "loss": 0.0032, "reward": 1.7952438592910767, "reward_std": 0.09894811734557152, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8030563592910767, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 298.8984375, "epoch": 1.5185546875, "grad_norm": 1.6978086850538485, "kl": 0.072265625, "learning_rate": 6.204833984375e-07, "loss": 0.0029, "reward": 1.7135571241378784, "reward_std": 0.05942201055586338, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7135571241378784, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 270.5703125, "epoch": 1.51904296875, "grad_norm": 11.000253342045735, "kl": 0.07666015625, "learning_rate": 6.20361328125e-07, "loss": 0.0031, "reward": 1.7439785599708557, "reward_std": 0.02391317579895258, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7439785599708557, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 302.25, "epoch": 1.51953125, "grad_norm": 2.1906764680903854, "kl": 0.066162109375, "learning_rate": 6.202392578125e-07, "loss": 0.0026, "reward": 1.8155664801597595, "reward_std": 0.06511466577649117, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8233789801597595, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 224.1875, "epoch": 1.52001953125, "grad_norm": 3.584987719765566, "kl": 0.0677490234375, "learning_rate": 6.201171874999999e-07, "loss": 0.0027, "reward": 1.8200541734695435, "reward_std": 0.049682820681482553, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8200541734695435, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 303.0859375, "epoch": 1.5205078125, "grad_norm": 2.018089591759681, "kl": 0.0748291015625, "learning_rate": 6.199951171875e-07, "loss": 0.003, "reward": 1.564685881137848, "reward_std": 0.1015004925429821, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.5724983513355255, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 257.7890625, "epoch": 1.52099609375, "grad_norm": 2.3565598111007593, "kl": 0.068603515625, "learning_rate": 6.19873046875e-07, "loss": 0.0027, "reward": 1.8028762936592102, "reward_std": 0.058065131306648254, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8028763234615326, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 287.546875, "epoch": 1.521484375, "grad_norm": 1.3845163583688693, "kl": 0.0859375, "learning_rate": 6.197509765625e-07, "loss": 0.0034, "reward": 1.8404591083526611, "reward_std": 0.04378460347652435, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8404591083526611, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 303.625, "epoch": 1.52197265625, "grad_norm": 1.321361217819104, "kl": 0.0712890625, "learning_rate": 6.1962890625e-07, "loss": 0.0029, "reward": 1.8013597130775452, "reward_std": 0.11758016794919968, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8247972130775452, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 261.46875, "epoch": 1.5224609375, "grad_norm": 0.9992851869601573, "kl": 0.0771484375, "learning_rate": 6.195068359375e-07, "loss": 0.0031, "reward": 1.8446565866470337, "reward_std": 0.04555722698569298, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8446565270423889, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 271.0546875, "epoch": 1.52294921875, "grad_norm": 1.3223466253645542, "kl": 0.0623779296875, "learning_rate": 6.193847656249999e-07, "loss": 0.0025, "reward": 1.7789299488067627, "reward_std": 0.06991294771432877, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7867424488067627, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 350.578125, "epoch": 1.5234375, "grad_norm": 2.6833880146707907, "kl": 0.048095703125, "learning_rate": 6.192626953124999e-07, "loss": 0.0019, "reward": 1.8840059041976929, "reward_std": 0.05926818028092384, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8840057849884033, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 300.46875, "epoch": 1.52392578125, "grad_norm": 1.4510987621579094, "kl": 0.068603515625, "learning_rate": 6.19140625e-07, "loss": 0.0027, "reward": 1.7248252034187317, "reward_std": 0.07782328687608242, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7248252332210541, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 296.4375, "epoch": 1.5244140625, "grad_norm": 1.136138045433055, "kl": 0.06982421875, "learning_rate": 6.190185546875e-07, "loss": 0.0028, "reward": 1.5833680629730225, "reward_std": 0.046957019716501236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.583368107676506, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 264.859375, "epoch": 1.52490234375, "grad_norm": 5.258286501764103, "kl": 0.095703125, "learning_rate": 6.18896484375e-07, "loss": 0.0038, "reward": 1.8398154973983765, "reward_std": 0.07556849718093872, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8398154675960541, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 238.9921875, "epoch": 1.525390625, "grad_norm": 3.19613212286148, "kl": 0.08203125, "learning_rate": 6.187744140625e-07, "loss": 0.0033, "reward": 1.7610740661621094, "reward_std": 0.08400712162256241, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7610740661621094, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 241.1328125, "epoch": 1.52587890625, "grad_norm": 1.2440046063167502, "kl": 0.0677490234375, "learning_rate": 6.1865234375e-07, "loss": 0.0027, "reward": 1.7349917888641357, "reward_std": 0.0564101692289114, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7349918186664581, "step": 3124 }, { "clip_ratio": 0.0, "completion_length": 239.609375, "epoch": 1.5263671875, "grad_norm": 1.9358611243229036, "kl": 0.0606689453125, "learning_rate": 6.185302734374999e-07, "loss": 0.0024, "reward": 1.871264934539795, "reward_std": 0.01118523720651865, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8712649047374725, "step": 3125 }, { "clip_ratio": 0.0, "completion_length": 254.4609375, "epoch": 1.52685546875, "grad_norm": 0.7636233474618398, "kl": 0.07373046875, "learning_rate": 6.18408203125e-07, "loss": 0.003, "reward": 1.7732288837432861, "reward_std": 0.04541287012398243, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7732289135456085, "step": 3126 }, { "clip_ratio": 0.0, "completion_length": 279.5703125, "epoch": 1.52734375, "grad_norm": 2.2485695960334566, "kl": 0.078125, "learning_rate": 6.182861328125e-07, "loss": 0.0031, "reward": 1.7269670367240906, "reward_std": 0.056932706385850906, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7269670069217682, "step": 3127 }, { "clip_ratio": 0.0, "completion_length": 250.875, "epoch": 1.52783203125, "grad_norm": 2.0938435538417135, "kl": 0.07080078125, "learning_rate": 6.181640625e-07, "loss": 0.0028, "reward": 1.8258466124534607, "reward_std": 0.034098366275429726, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8258466720581055, "step": 3128 }, { "clip_ratio": 0.0, "completion_length": 218.125, "epoch": 1.5283203125, "grad_norm": 6.351273063845393, "kl": 0.08837890625, "learning_rate": 6.180419921875e-07, "loss": 0.0035, "reward": 1.6825706362724304, "reward_std": 0.06605061516165733, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6825706958770752, "step": 3129 }, { "clip_ratio": 0.0, "completion_length": 242.8828125, "epoch": 1.52880859375, "grad_norm": 1.73857162773802, "kl": 0.0908203125, "learning_rate": 6.17919921875e-07, "loss": 0.0036, "reward": 1.8035194873809814, "reward_std": 0.06293283682316542, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8035195171833038, "step": 3130 }, { "clip_ratio": 0.0, "completion_length": 283.90625, "epoch": 1.529296875, "grad_norm": 1.8707235948004883, "kl": 0.0869140625, "learning_rate": 6.177978515624999e-07, "loss": 0.0035, "reward": 1.85841304063797, "reward_std": 0.05445600301027298, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.85841304063797, "step": 3131 }, { "clip_ratio": 0.0, "completion_length": 294.03125, "epoch": 1.52978515625, "grad_norm": 1.4148200869799006, "kl": 0.0732421875, "learning_rate": 6.176757812499999e-07, "loss": 0.0029, "reward": 1.7566508054733276, "reward_std": 0.0620297584682703, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7566508650779724, "step": 3132 }, { "clip_ratio": 0.0, "completion_length": 286.8671875, "epoch": 1.5302734375, "grad_norm": 3.895550568287067, "kl": 0.0673828125, "learning_rate": 6.175537109375e-07, "loss": 0.0027, "reward": 1.735254943370819, "reward_std": 0.08439107239246368, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7352549433708191, "step": 3133 }, { "clip_ratio": 0.0, "completion_length": 236.5546875, "epoch": 1.53076171875, "grad_norm": 1.1219004153321612, "kl": 0.0908203125, "learning_rate": 6.17431640625e-07, "loss": 0.0036, "reward": 1.7691398859024048, "reward_std": 0.03892973717302084, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7691399157047272, "step": 3134 }, { "clip_ratio": 0.0, "completion_length": 333.359375, "epoch": 1.53125, "grad_norm": 5.821411252101163, "kl": 0.08349609375, "learning_rate": 6.173095703125e-07, "loss": 0.0033, "reward": 1.6651095747947693, "reward_std": 0.07258575409650803, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6651095747947693, "step": 3135 }, { "clip_ratio": 0.0, "completion_length": 265.3046875, "epoch": 1.53173828125, "grad_norm": 1.82841994581108, "kl": 0.0751953125, "learning_rate": 6.171875e-07, "loss": 0.003, "reward": 1.8692357540130615, "reward_std": 0.0736299641430378, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8692357242107391, "step": 3136 }, { "clip_ratio": 0.0, "completion_length": 262.78125, "epoch": 1.5322265625, "grad_norm": 2.0403074353548045, "kl": 0.08203125, "learning_rate": 6.170654296875e-07, "loss": 0.0033, "reward": 1.61654931306839, "reward_std": 0.022080027498304844, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6165493130683899, "step": 3137 }, { "clip_ratio": 0.0, "completion_length": 237.546875, "epoch": 1.53271484375, "grad_norm": 18.76627322206088, "kl": 0.08447265625, "learning_rate": 6.169433593749999e-07, "loss": 0.0034, "reward": 1.7105411887168884, "reward_std": 0.09866257756948471, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7105412185192108, "step": 3138 }, { "clip_ratio": 0.0, "completion_length": 351.53125, "epoch": 1.533203125, "grad_norm": 2.305963685532532, "kl": 0.058837890625, "learning_rate": 6.168212890625e-07, "loss": 0.0024, "reward": 1.7301769852638245, "reward_std": 0.13028892129659653, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7458019852638245, "step": 3139 }, { "clip_ratio": 0.0, "completion_length": 206.3203125, "epoch": 1.53369140625, "grad_norm": 0.24505066269967637, "kl": 0.0689697265625, "learning_rate": 6.1669921875e-07, "loss": 0.0028, "reward": 1.7604427337646484, "reward_std": 0.028698831796646118, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.760442703962326, "step": 3140 }, { "clip_ratio": 0.0, "completion_length": 221.9921875, "epoch": 1.5341796875, "grad_norm": 1.735357218192801, "kl": 0.092529296875, "learning_rate": 6.165771484375e-07, "loss": 0.0037, "reward": 1.8620553016662598, "reward_std": 0.03215474262833595, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8620553314685822, "step": 3141 }, { "clip_ratio": 0.0, "completion_length": 293.4375, "epoch": 1.53466796875, "grad_norm": 2.8803153468108436, "kl": 0.076416015625, "learning_rate": 6.16455078125e-07, "loss": 0.0031, "reward": 1.8484528064727783, "reward_std": 0.07194521278142929, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8484528362751007, "step": 3142 }, { "clip_ratio": 0.0, "completion_length": 302.7734375, "epoch": 1.53515625, "grad_norm": 2.679606476186728, "kl": 0.0750732421875, "learning_rate": 6.163330078125e-07, "loss": 0.003, "reward": 1.7876355051994324, "reward_std": 0.12041214294731617, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7954480051994324, "step": 3143 }, { "clip_ratio": 0.0, "completion_length": 331.953125, "epoch": 1.53564453125, "grad_norm": 1.616840595359916, "kl": 0.06201171875, "learning_rate": 6.162109374999999e-07, "loss": 0.0025, "reward": 1.8307392001152039, "reward_std": 0.03423440642654896, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8307392001152039, "step": 3144 }, { "clip_ratio": 0.0, "completion_length": 209.140625, "epoch": 1.5361328125, "grad_norm": 2.036514638498422, "kl": 0.10400390625, "learning_rate": 6.160888671874999e-07, "loss": 0.0042, "reward": 1.8995371460914612, "reward_std": 0.08496665954589844, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8995371758937836, "step": 3145 }, { "clip_ratio": 0.0, "completion_length": 261.5859375, "epoch": 1.53662109375, "grad_norm": 1.0505257847917335, "kl": 0.0780029296875, "learning_rate": 6.15966796875e-07, "loss": 0.0031, "reward": 1.7895857691764832, "reward_std": 0.032869850285351276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7895857095718384, "step": 3146 }, { "clip_ratio": 0.0, "completion_length": 357.4140625, "epoch": 1.537109375, "grad_norm": 2.49269461000481, "kl": 0.087646484375, "learning_rate": 6.158447265625e-07, "loss": 0.0035, "reward": 1.729568362236023, "reward_std": 0.14366939291357994, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7530059218406677, "step": 3147 }, { "clip_ratio": 0.0, "completion_length": 275.53125, "epoch": 1.53759765625, "grad_norm": 0.9767868294994723, "kl": 0.06982421875, "learning_rate": 6.1572265625e-07, "loss": 0.0028, "reward": 1.9281029105186462, "reward_std": 0.06500357203185558, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9281029105186462, "step": 3148 }, { "clip_ratio": 0.0, "completion_length": 298.53125, "epoch": 1.5380859375, "grad_norm": 1.567228738812135, "kl": 0.080078125, "learning_rate": 6.156005859375e-07, "loss": 0.0032, "reward": 1.8201581239700317, "reward_std": 0.07531145215034485, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8279706239700317, "step": 3149 }, { "clip_ratio": 0.0, "completion_length": 234.1015625, "epoch": 1.53857421875, "grad_norm": 1.37480935198485, "kl": 0.07861328125, "learning_rate": 6.15478515625e-07, "loss": 0.0031, "reward": 1.8080313205718994, "reward_std": 0.02758025284856558, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8080313503742218, "step": 3150 }, { "clip_ratio": 0.0, "completion_length": 265.0390625, "epoch": 1.5390625, "grad_norm": 1.8428019019258013, "kl": 0.096435546875, "learning_rate": 6.153564453124999e-07, "loss": 0.0039, "reward": 1.6825060844421387, "reward_std": 0.03835061937570572, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6825061142444611, "step": 3151 }, { "clip_ratio": 0.0, "completion_length": 282.46875, "epoch": 1.53955078125, "grad_norm": 1.7497641001072382, "kl": 0.104736328125, "learning_rate": 6.152343749999999e-07, "loss": 0.0042, "reward": 1.747897982597351, "reward_std": 0.04787810705602169, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7478979229927063, "step": 3152 }, { "clip_ratio": 0.0, "completion_length": 230.34375, "epoch": 1.5400390625, "grad_norm": 5.833818291905867, "kl": 0.089599609375, "learning_rate": 6.151123046875e-07, "loss": 0.0036, "reward": 1.6996482610702515, "reward_std": 0.033032437320798635, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6996482610702515, "step": 3153 }, { "clip_ratio": 0.0, "completion_length": 257.1953125, "epoch": 1.54052734375, "grad_norm": 3.41976551776808, "kl": 0.0748291015625, "learning_rate": 6.14990234375e-07, "loss": 0.003, "reward": 1.7531882524490356, "reward_std": 0.01387872640043497, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7531882524490356, "step": 3154 }, { "clip_ratio": 0.0, "completion_length": 298.2578125, "epoch": 1.541015625, "grad_norm": 1.8858479830744364, "kl": 0.067626953125, "learning_rate": 6.148681640625e-07, "loss": 0.0027, "reward": 1.6333616375923157, "reward_std": 0.07776164263486862, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6411741375923157, "step": 3155 }, { "clip_ratio": 0.0, "completion_length": 329.0078125, "epoch": 1.54150390625, "grad_norm": 0.8035865329277778, "kl": 0.05712890625, "learning_rate": 6.1474609375e-07, "loss": 0.0023, "reward": 1.7784574627876282, "reward_std": 0.05039230780676007, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7784574329853058, "step": 3156 }, { "clip_ratio": 0.0, "completion_length": 280.9296875, "epoch": 1.5419921875, "grad_norm": 6.951850739468607, "kl": 0.0699462890625, "learning_rate": 6.146240234374999e-07, "loss": 0.0028, "reward": 1.7199862003326416, "reward_std": 0.1958215907216072, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7199861109256744, "step": 3157 }, { "clip_ratio": 0.0, "completion_length": 229.9453125, "epoch": 1.54248046875, "grad_norm": 3.9343633726846288, "kl": 0.076904296875, "learning_rate": 6.145019531249999e-07, "loss": 0.0031, "reward": 1.8535465002059937, "reward_std": 0.04394886875525117, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8535465002059937, "step": 3158 }, { "clip_ratio": 0.0, "completion_length": 294.109375, "epoch": 1.54296875, "grad_norm": 1.6757964157727798, "kl": 0.0643310546875, "learning_rate": 6.143798828125e-07, "loss": 0.0026, "reward": 1.8247731924057007, "reward_std": 0.11207094416022301, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8325856924057007, "step": 3159 }, { "clip_ratio": 0.0, "completion_length": 375.9140625, "epoch": 1.54345703125, "grad_norm": 1.5482408835535821, "kl": 0.057373046875, "learning_rate": 6.142578125e-07, "loss": 0.0023, "reward": 1.7898805737495422, "reward_std": 0.08811133727431297, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7898805737495422, "step": 3160 }, { "clip_ratio": 0.0, "completion_length": 296.7890625, "epoch": 1.5439453125, "grad_norm": 1.4270590005546342, "kl": 0.06005859375, "learning_rate": 6.141357421875e-07, "loss": 0.0024, "reward": 1.7193145751953125, "reward_std": 0.12322738021612167, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7271271049976349, "step": 3161 }, { "clip_ratio": 0.0, "completion_length": 223.328125, "epoch": 1.54443359375, "grad_norm": 8.265490886129752, "kl": 0.0648193359375, "learning_rate": 6.14013671875e-07, "loss": 0.0026, "reward": 1.8019053936004639, "reward_std": 0.03768607368692756, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8019054532051086, "step": 3162 }, { "clip_ratio": 0.0, "completion_length": 252.9921875, "epoch": 1.544921875, "grad_norm": 2.5739517984161697, "kl": 0.07080078125, "learning_rate": 6.138916015625e-07, "loss": 0.0028, "reward": 1.8906748294830322, "reward_std": 0.04288986138999462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.890674889087677, "step": 3163 }, { "clip_ratio": 0.0, "completion_length": 309.859375, "epoch": 1.54541015625, "grad_norm": 4.09072500677864, "kl": 0.068115234375, "learning_rate": 6.137695312499999e-07, "loss": 0.0027, "reward": 1.8343228101730347, "reward_std": 0.07122788205742836, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8421353101730347, "step": 3164 }, { "clip_ratio": 0.0, "completion_length": 268.8671875, "epoch": 1.5458984375, "grad_norm": 2.310781538336421, "kl": 0.0704345703125, "learning_rate": 6.136474609374999e-07, "loss": 0.0028, "reward": 1.69595205783844, "reward_std": 0.13838719576597214, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7115771174430847, "step": 3165 }, { "clip_ratio": 0.0, "completion_length": 278.8828125, "epoch": 1.54638671875, "grad_norm": 7.1679887718503545, "kl": 0.076904296875, "learning_rate": 6.13525390625e-07, "loss": 0.0031, "reward": 1.631429135799408, "reward_std": 0.11012212559580803, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.639241635799408, "step": 3166 }, { "clip_ratio": 0.0, "completion_length": 383.484375, "epoch": 1.546875, "grad_norm": 1.8191201659721, "kl": 0.0643310546875, "learning_rate": 6.134033203125e-07, "loss": 0.0026, "reward": 1.7357022166252136, "reward_std": 0.16989228129386902, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7747646570205688, "step": 3167 }, { "clip_ratio": 0.0, "completion_length": 348.953125, "epoch": 1.54736328125, "grad_norm": 1.402226497605308, "kl": 0.090576171875, "learning_rate": 6.1328125e-07, "loss": 0.0036, "reward": 1.6996177434921265, "reward_std": 0.12328409217298031, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7152426540851593, "step": 3168 }, { "clip_ratio": 0.0, "completion_length": 243.25, "epoch": 1.5478515625, "grad_norm": 1.2654508230027297, "kl": 0.0609130859375, "learning_rate": 6.131591796875e-07, "loss": 0.0024, "reward": 1.8366875052452087, "reward_std": 0.04399787541478872, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8366875350475311, "step": 3169 }, { "clip_ratio": 0.0, "completion_length": 292.421875, "epoch": 1.54833984375, "grad_norm": 7.351539110964937, "kl": 0.0694580078125, "learning_rate": 6.130371093749999e-07, "loss": 0.0028, "reward": 1.8194851875305176, "reward_std": 0.04994682688266039, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8194851577281952, "step": 3170 }, { "clip_ratio": 0.0, "completion_length": 357.1875, "epoch": 1.548828125, "grad_norm": 0.9587044136725793, "kl": 0.0552978515625, "learning_rate": 6.129150390624999e-07, "loss": 0.0022, "reward": 1.7184346914291382, "reward_std": 0.11068252101540565, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7340596914291382, "step": 3171 }, { "clip_ratio": 0.0, "completion_length": 345.40625, "epoch": 1.54931640625, "grad_norm": 5.169780841895419, "kl": 0.06201171875, "learning_rate": 6.1279296875e-07, "loss": 0.0025, "reward": 1.6911205053329468, "reward_std": 0.1017858237028122, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7067455053329468, "step": 3172 }, { "clip_ratio": 0.0, "completion_length": 303.140625, "epoch": 1.5498046875, "grad_norm": 4.360943076089211, "kl": 0.058349609375, "learning_rate": 6.126708984375e-07, "loss": 0.0023, "reward": 1.6890897750854492, "reward_std": 0.1352338343858719, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.7203397750854492, "step": 3173 }, { "clip_ratio": 0.0, "completion_length": 309.5703125, "epoch": 1.55029296875, "grad_norm": 4.05868490070331, "kl": 0.058837890625, "learning_rate": 6.12548828125e-07, "loss": 0.0024, "reward": 1.8603836297988892, "reward_std": 0.11744150519371033, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8681961894035339, "step": 3174 }, { "clip_ratio": 0.0, "completion_length": 253.578125, "epoch": 1.55078125, "grad_norm": 3.0900057199687456, "kl": 0.063720703125, "learning_rate": 6.124267578125e-07, "loss": 0.0026, "reward": 1.7822343111038208, "reward_std": 0.04202779196202755, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7822343111038208, "step": 3175 }, { "clip_ratio": 0.0, "completion_length": 293.3671875, "epoch": 1.55126953125, "grad_norm": 1.3501893344640266, "kl": 0.0716552734375, "learning_rate": 6.123046875e-07, "loss": 0.0029, "reward": 1.9244567155838013, "reward_std": 0.13922565057873726, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.9478942155838013, "step": 3176 }, { "clip_ratio": 0.0, "completion_length": 296.8984375, "epoch": 1.5517578125, "grad_norm": 1.423254075976608, "kl": 0.059326171875, "learning_rate": 6.121826171874999e-07, "loss": 0.0024, "reward": 1.890386700630188, "reward_std": 0.09997991472482681, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8981991708278656, "step": 3177 }, { "clip_ratio": 0.0, "completion_length": 215.5546875, "epoch": 1.55224609375, "grad_norm": 0.7915773476493393, "kl": 0.0577392578125, "learning_rate": 6.120605468749999e-07, "loss": 0.0023, "reward": 1.784228265285492, "reward_std": 0.016746554523706436, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7842282950878143, "step": 3178 }, { "clip_ratio": 0.0, "completion_length": 320.2734375, "epoch": 1.552734375, "grad_norm": 1.05922557400982, "kl": 0.0587158203125, "learning_rate": 6.119384765625e-07, "loss": 0.0023, "reward": 1.7570964097976685, "reward_std": 0.12910258024930954, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7805339395999908, "step": 3179 }, { "clip_ratio": 0.0, "completion_length": 333.40625, "epoch": 1.55322265625, "grad_norm": 1.6684105733598493, "kl": 0.0655517578125, "learning_rate": 6.1181640625e-07, "loss": 0.0026, "reward": 1.5738105773925781, "reward_std": 0.14181802049279213, "rewards/format_reward": 0.921875, "rewards/ocr_reward": 0.6519355773925781, "step": 3180 }, { "clip_ratio": 0.0, "completion_length": 233.9921875, "epoch": 1.5537109375, "grad_norm": 1.1439095055320303, "kl": 0.0625, "learning_rate": 6.116943359375e-07, "loss": 0.0025, "reward": 1.791804313659668, "reward_std": 0.07595885917544365, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.791804313659668, "step": 3181 }, { "clip_ratio": 0.0, "completion_length": 363.046875, "epoch": 1.55419921875, "grad_norm": 1.538031849450602, "kl": 0.06298828125, "learning_rate": 6.11572265625e-07, "loss": 0.0025, "reward": 1.7163517475128174, "reward_std": 0.06050669401884079, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7163517475128174, "step": 3182 }, { "clip_ratio": 0.0, "completion_length": 276.1953125, "epoch": 1.5546875, "grad_norm": 0.7099072244794439, "kl": 0.0516357421875, "learning_rate": 6.114501953124999e-07, "loss": 0.0021, "reward": 1.7604435086250305, "reward_std": 0.019243311136960983, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7604435384273529, "step": 3183 }, { "clip_ratio": 0.0, "completion_length": 309.6171875, "epoch": 1.55517578125, "grad_norm": 1.464970168168766, "kl": 0.068115234375, "learning_rate": 6.113281249999999e-07, "loss": 0.0027, "reward": 1.7117717266082764, "reward_std": 0.03939279168844223, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7117717266082764, "step": 3184 }, { "clip_ratio": 0.0, "completion_length": 357.2890625, "epoch": 1.5556640625, "grad_norm": 2.4843836102727055, "kl": 0.0599365234375, "learning_rate": 6.112060546875e-07, "loss": 0.0024, "reward": 1.7494711875915527, "reward_std": 0.16199829429388046, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7729085981845856, "step": 3185 }, { "clip_ratio": 0.0, "completion_length": 230.5390625, "epoch": 1.55615234375, "grad_norm": 2.4069985981517785, "kl": 0.062255859375, "learning_rate": 6.11083984375e-07, "loss": 0.0025, "reward": 1.844676434993744, "reward_std": 0.07243941724300385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8446764647960663, "step": 3186 }, { "clip_ratio": 0.0, "completion_length": 284.4765625, "epoch": 1.556640625, "grad_norm": 1.9087556018788279, "kl": 0.0859375, "learning_rate": 6.109619140625e-07, "loss": 0.0034, "reward": 1.7857062220573425, "reward_std": 0.04655470885336399, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7857063114643097, "step": 3187 }, { "clip_ratio": 0.0, "completion_length": 452.625, "epoch": 1.55712890625, "grad_norm": 2.7113439900767484, "kl": 0.0533447265625, "learning_rate": 6.1083984375e-07, "loss": 0.0021, "reward": 1.6991124153137207, "reward_std": 0.18883011117577553, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7459874153137207, "step": 3188 }, { "clip_ratio": 0.0, "completion_length": 426.1875, "epoch": 1.5576171875, "grad_norm": 5.941979108249293, "kl": 0.0625, "learning_rate": 6.107177734375e-07, "loss": 0.0025, "reward": 1.680684208869934, "reward_std": 0.12884881347417831, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6963091790676117, "step": 3189 }, { "clip_ratio": 0.0, "completion_length": 287.421875, "epoch": 1.55810546875, "grad_norm": 1.2343491223909973, "kl": 0.069580078125, "learning_rate": 6.105957031249999e-07, "loss": 0.0028, "reward": 1.816174864768982, "reward_std": 0.03796030767261982, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8161748945713043, "step": 3190 }, { "clip_ratio": 0.0, "completion_length": 309.4453125, "epoch": 1.55859375, "grad_norm": 5.34134665833269, "kl": 0.053955078125, "learning_rate": 6.104736328124999e-07, "loss": 0.0022, "reward": 1.7767646312713623, "reward_std": 0.05293313413858414, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7767646312713623, "step": 3191 }, { "clip_ratio": 0.0, "completion_length": 269.59375, "epoch": 1.55908203125, "grad_norm": 1.857327467302615, "kl": 0.070556640625, "learning_rate": 6.103515625e-07, "loss": 0.0028, "reward": 1.6192150712013245, "reward_std": 0.04519081301987171, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6192150712013245, "step": 3192 }, { "clip_ratio": 0.0, "completion_length": 276.8125, "epoch": 1.5595703125, "grad_norm": 0.7278409601873849, "kl": 0.061279296875, "learning_rate": 6.102294921875e-07, "loss": 0.0025, "reward": 1.6941944360733032, "reward_std": 0.11231286264955997, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7098194360733032, "step": 3193 }, { "clip_ratio": 0.0, "completion_length": 276.4765625, "epoch": 1.56005859375, "grad_norm": 1.4439461511082938, "kl": 0.0611572265625, "learning_rate": 6.10107421875e-07, "loss": 0.0024, "reward": 1.7672069072723389, "reward_std": 0.029297824949026108, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7672069072723389, "step": 3194 }, { "clip_ratio": 0.0, "completion_length": 303.90625, "epoch": 1.560546875, "grad_norm": 1.5977665784586512, "kl": 0.06640625, "learning_rate": 6.099853515625e-07, "loss": 0.0027, "reward": 1.8012299537658691, "reward_std": 0.07727400679141283, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012299239635468, "step": 3195 }, { "clip_ratio": 0.0, "completion_length": 306.2265625, "epoch": 1.56103515625, "grad_norm": 5.699754950304771, "kl": 0.082763671875, "learning_rate": 6.0986328125e-07, "loss": 0.0033, "reward": 1.669058918952942, "reward_std": 0.07229340635240078, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6690589189529419, "step": 3196 }, { "clip_ratio": 0.0, "completion_length": 291.6875, "epoch": 1.5615234375, "grad_norm": 1.7331866102593816, "kl": 0.060791015625, "learning_rate": 6.097412109374999e-07, "loss": 0.0024, "reward": 1.8495106101036072, "reward_std": 0.07928337901830673, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8573231399059296, "step": 3197 }, { "clip_ratio": 0.0, "completion_length": 287.046875, "epoch": 1.56201171875, "grad_norm": 1.7502170233469734, "kl": 0.08056640625, "learning_rate": 6.09619140625e-07, "loss": 0.0032, "reward": 1.7798677682876587, "reward_std": 0.07166917249560356, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7798677682876587, "step": 3198 }, { "clip_ratio": 0.0, "completion_length": 281.2421875, "epoch": 1.5625, "grad_norm": 1.6271455308496765, "kl": 0.0672607421875, "learning_rate": 6.094970703125e-07, "loss": 0.0027, "reward": 1.837442696094513, "reward_std": 0.08605869952589273, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.8843176364898682, "step": 3199 }, { "clip_ratio": 0.0, "completion_length": 236.421875, "epoch": 1.56298828125, "grad_norm": 1.858548397213829, "kl": 0.0660400390625, "learning_rate": 6.09375e-07, "loss": 0.0026, "reward": 1.852772295475006, "reward_std": 0.04148021200671792, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8527723550796509, "step": 3200 }, { "clip_ratio": 0.0, "completion_length": 233.1328125, "epoch": 1.5634765625, "grad_norm": 2.830884430833045, "kl": 0.084228515625, "learning_rate": 6.092529296875e-07, "loss": 0.0034, "reward": 1.7869747877120972, "reward_std": 0.031565818935632706, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7869747877120972, "step": 3201 }, { "clip_ratio": 0.0, "completion_length": 251.9140625, "epoch": 1.56396484375, "grad_norm": 1.7092660582715655, "kl": 0.0751953125, "learning_rate": 6.09130859375e-07, "loss": 0.003, "reward": 1.7863489985466003, "reward_std": 0.11272731982171535, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8019739985466003, "step": 3202 }, { "clip_ratio": 0.0, "completion_length": 304.203125, "epoch": 1.564453125, "grad_norm": 4.3119377697610295, "kl": 0.05322265625, "learning_rate": 6.090087890624999e-07, "loss": 0.0021, "reward": 1.8258104920387268, "reward_std": 0.05819419212639332, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8258104920387268, "step": 3203 }, { "clip_ratio": 0.0, "completion_length": 301.03125, "epoch": 1.56494140625, "grad_norm": 1.7927135426124725, "kl": 0.0760498046875, "learning_rate": 6.088867187499999e-07, "loss": 0.003, "reward": 1.7928436994552612, "reward_std": 0.03461040183901787, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7928436994552612, "step": 3204 }, { "clip_ratio": 0.0, "completion_length": 311.4375, "epoch": 1.5654296875, "grad_norm": 2.7188622700332865, "kl": 0.064697265625, "learning_rate": 6.087646484375e-07, "loss": 0.0026, "reward": 1.7308800220489502, "reward_std": 0.15542292036116123, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7465050220489502, "step": 3205 }, { "clip_ratio": 0.0, "completion_length": 251.9921875, "epoch": 1.56591796875, "grad_norm": 1.020499144334956, "kl": 0.060546875, "learning_rate": 6.08642578125e-07, "loss": 0.0024, "reward": 1.7392455339431763, "reward_std": 0.11669945158064365, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7548705637454987, "step": 3206 }, { "clip_ratio": 0.0, "completion_length": 296.625, "epoch": 1.56640625, "grad_norm": 2.073316599502687, "kl": 0.08447265625, "learning_rate": 6.085205078125e-07, "loss": 0.0034, "reward": 1.78858482837677, "reward_std": 0.05400579236447811, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.78858482837677, "step": 3207 }, { "clip_ratio": 0.0, "completion_length": 265.953125, "epoch": 1.56689453125, "grad_norm": 1.0320420379679651, "kl": 0.0703125, "learning_rate": 6.083984375e-07, "loss": 0.0028, "reward": 1.7168057560920715, "reward_std": 0.08419827371835709, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7246182560920715, "step": 3208 }, { "clip_ratio": 0.0, "completion_length": 273.953125, "epoch": 1.5673828125, "grad_norm": 1.1431633568387154, "kl": 0.080810546875, "learning_rate": 6.082763671875e-07, "loss": 0.0032, "reward": 1.8084399104118347, "reward_std": 0.05502317473292351, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8084399402141571, "step": 3209 }, { "clip_ratio": 0.0, "completion_length": 239.203125, "epoch": 1.56787109375, "grad_norm": 3.5444712422473756, "kl": 0.07470703125, "learning_rate": 6.081542968749999e-07, "loss": 0.003, "reward": 1.7140299677848816, "reward_std": 0.04421941842883825, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.714029997587204, "step": 3210 }, { "clip_ratio": 0.0, "completion_length": 240.7421875, "epoch": 1.568359375, "grad_norm": 1.3334847785571438, "kl": 0.087646484375, "learning_rate": 6.080322265625e-07, "loss": 0.0035, "reward": 1.7436646223068237, "reward_std": 0.05065160011872649, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7436646223068237, "step": 3211 }, { "clip_ratio": 0.0, "completion_length": 358.65625, "epoch": 1.56884765625, "grad_norm": 2.331640767662747, "kl": 0.068115234375, "learning_rate": 6.0791015625e-07, "loss": 0.0027, "reward": 1.755677580833435, "reward_std": 0.038787453435361385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7556776106357574, "step": 3212 }, { "clip_ratio": 0.0, "completion_length": 278.65625, "epoch": 1.5693359375, "grad_norm": 1.475076168231399, "kl": 0.07958984375, "learning_rate": 6.077880859375e-07, "loss": 0.0032, "reward": 1.8441100716590881, "reward_std": 0.06590352766215801, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8441100716590881, "step": 3213 }, { "clip_ratio": 0.0, "completion_length": 286.59375, "epoch": 1.56982421875, "grad_norm": 1.3134125044104092, "kl": 0.09912109375, "learning_rate": 6.07666015625e-07, "loss": 0.004, "reward": 1.8156479597091675, "reward_std": 0.07976316474378109, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8234605491161346, "step": 3214 }, { "clip_ratio": 0.0, "completion_length": 255.3046875, "epoch": 1.5703125, "grad_norm": 2.259184238309457, "kl": 0.065673828125, "learning_rate": 6.075439453125e-07, "loss": 0.0026, "reward": 2.002101182937622, "reward_std": 0.05331834591925144, "rewards/format_reward": 1.0, "rewards/ocr_reward": 1.002101182937622, "step": 3215 }, { "clip_ratio": 0.0, "completion_length": 308.3671875, "epoch": 1.57080078125, "grad_norm": 5.468710622222766, "kl": 0.0555419921875, "learning_rate": 6.074218749999999e-07, "loss": 0.0022, "reward": 1.8509008884429932, "reward_std": 0.030736079439520836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8509008884429932, "step": 3216 }, { "clip_ratio": 0.0, "completion_length": 300.5078125, "epoch": 1.5712890625, "grad_norm": 1.528191106828427, "kl": 0.070068359375, "learning_rate": 6.072998046874999e-07, "loss": 0.0028, "reward": 1.7934442162513733, "reward_std": 0.017778453417122364, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7934441566467285, "step": 3217 }, { "clip_ratio": 0.0, "completion_length": 292.8671875, "epoch": 1.57177734375, "grad_norm": 2.864106160888729, "kl": 0.0616455078125, "learning_rate": 6.07177734375e-07, "loss": 0.0025, "reward": 1.809788703918457, "reward_std": 0.1024474948644638, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8176011443138123, "step": 3218 }, { "clip_ratio": 0.0, "completion_length": 321.59375, "epoch": 1.572265625, "grad_norm": 3.8916992472152545, "kl": 0.071533203125, "learning_rate": 6.070556640625e-07, "loss": 0.0029, "reward": 1.6070039868354797, "reward_std": 0.08922014944255352, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.661691427230835, "step": 3219 }, { "clip_ratio": 0.0, "completion_length": 277.90625, "epoch": 1.57275390625, "grad_norm": 2.5009858382320638, "kl": 0.079833984375, "learning_rate": 6.0693359375e-07, "loss": 0.0032, "reward": 1.78548663854599, "reward_std": 0.062262922525405884, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.78548663854599, "step": 3220 }, { "clip_ratio": 0.0, "completion_length": 238.3203125, "epoch": 1.5732421875, "grad_norm": 0.6996858384501395, "kl": 0.079833984375, "learning_rate": 6.068115234375e-07, "loss": 0.0032, "reward": 1.783010184764862, "reward_std": 0.04869150370359421, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7908226549625397, "step": 3221 }, { "clip_ratio": 0.0, "completion_length": 296.109375, "epoch": 1.57373046875, "grad_norm": 1.1006856780390493, "kl": 0.0555419921875, "learning_rate": 6.06689453125e-07, "loss": 0.0022, "reward": 1.8491575717926025, "reward_std": 0.05759404879063368, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8491575717926025, "step": 3222 }, { "clip_ratio": 0.0, "completion_length": 286.3515625, "epoch": 1.57421875, "grad_norm": 1.1841499529105677, "kl": 0.07666015625, "learning_rate": 6.065673828124999e-07, "loss": 0.0031, "reward": 1.8020890951156616, "reward_std": 0.036463672295212746, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.802089124917984, "step": 3223 }, { "clip_ratio": 0.0, "completion_length": 286.2421875, "epoch": 1.57470703125, "grad_norm": 0.7286182525692657, "kl": 0.0599365234375, "learning_rate": 6.064453125e-07, "loss": 0.0024, "reward": 1.7325817942619324, "reward_std": 0.027954386197961867, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7325817942619324, "step": 3224 }, { "clip_ratio": 0.0, "completion_length": 297.71875, "epoch": 1.5751953125, "grad_norm": 1.237495827483586, "kl": 0.06689453125, "learning_rate": 6.063232421875e-07, "loss": 0.0027, "reward": 1.7775406241416931, "reward_std": 0.08435166534036398, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8009780943393707, "step": 3225 }, { "clip_ratio": 0.0, "completion_length": 376.90625, "epoch": 1.57568359375, "grad_norm": 0.528238479149417, "kl": 0.0548095703125, "learning_rate": 6.06201171875e-07, "loss": 0.0022, "reward": 1.8354755640029907, "reward_std": 0.14160921424627304, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8511005938053131, "step": 3226 }, { "clip_ratio": 0.0, "completion_length": 282.484375, "epoch": 1.576171875, "grad_norm": 1.8220474361249062, "kl": 0.0606689453125, "learning_rate": 6.060791015625e-07, "loss": 0.0024, "reward": 1.8181970715522766, "reward_std": 0.13987145572900772, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8338220417499542, "step": 3227 }, { "clip_ratio": 0.0, "completion_length": 335.859375, "epoch": 1.57666015625, "grad_norm": 0.6962390678727239, "kl": 0.0594482421875, "learning_rate": 6.0595703125e-07, "loss": 0.0024, "reward": 1.7669880390167236, "reward_std": 0.06059642741456628, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7826130092144012, "step": 3228 }, { "clip_ratio": 0.0, "completion_length": 300.1640625, "epoch": 1.5771484375, "grad_norm": 1.5005401701296768, "kl": 0.076904296875, "learning_rate": 6.058349609374999e-07, "loss": 0.0031, "reward": 1.8269048929214478, "reward_std": 0.05390936695039272, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8269048929214478, "step": 3229 }, { "clip_ratio": 0.0, "completion_length": 305.5234375, "epoch": 1.57763671875, "grad_norm": 16.482646426702743, "kl": 0.069580078125, "learning_rate": 6.057128906249999e-07, "loss": 0.0028, "reward": 1.763689935207367, "reward_std": 0.19288001954555511, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7871273458003998, "step": 3230 }, { "clip_ratio": 0.0, "completion_length": 310.7265625, "epoch": 1.578125, "grad_norm": 3.7445612059262894, "kl": 0.0947265625, "learning_rate": 6.055908203125e-07, "loss": 0.0038, "reward": 1.7791760563850403, "reward_std": 0.03553357906639576, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7791760265827179, "step": 3231 }, { "clip_ratio": 0.0, "completion_length": 318.734375, "epoch": 1.57861328125, "grad_norm": 15.055576583772224, "kl": 0.0604248046875, "learning_rate": 6.0546875e-07, "loss": 0.0024, "reward": 1.8308890461921692, "reward_std": 0.10220515914261341, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8387015163898468, "step": 3232 }, { "clip_ratio": 0.0, "completion_length": 333.4609375, "epoch": 1.5791015625, "grad_norm": 0.9772827198196143, "kl": 0.07421875, "learning_rate": 6.053466796875e-07, "loss": 0.003, "reward": 1.8157562017440796, "reward_std": 0.1360500417649746, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8313812017440796, "step": 3233 }, { "clip_ratio": 0.0, "completion_length": 270.796875, "epoch": 1.57958984375, "grad_norm": 0.6814418196014778, "kl": 0.09228515625, "learning_rate": 6.05224609375e-07, "loss": 0.0037, "reward": 1.7003534436225891, "reward_std": 0.06812034081667662, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7081659436225891, "step": 3234 }, { "clip_ratio": 0.0, "completion_length": 258.5078125, "epoch": 1.580078125, "grad_norm": 4.059148446029115, "kl": 0.073974609375, "learning_rate": 6.051025390625e-07, "loss": 0.003, "reward": 1.7905691862106323, "reward_std": 0.053706713020801544, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7905691266059875, "step": 3235 }, { "clip_ratio": 0.0, "completion_length": 289.6328125, "epoch": 1.58056640625, "grad_norm": 0.89045191140724, "kl": 0.057861328125, "learning_rate": 6.049804687499999e-07, "loss": 0.0023, "reward": 1.9589157104492188, "reward_std": 0.05596003495156765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9589157104492188, "step": 3236 }, { "clip_ratio": 0.0, "completion_length": 267.671875, "epoch": 1.5810546875, "grad_norm": 27.98655992203742, "kl": 0.091796875, "learning_rate": 6.048583984375e-07, "loss": 0.0037, "reward": 1.742477536201477, "reward_std": 0.09132163226604462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.742477536201477, "step": 3237 }, { "clip_ratio": 0.0, "completion_length": 311.78125, "epoch": 1.58154296875, "grad_norm": 1.8475477481704758, "kl": 0.067626953125, "learning_rate": 6.04736328125e-07, "loss": 0.0027, "reward": 1.7812891602516174, "reward_std": 0.13560626655817032, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7969141900539398, "step": 3238 }, { "clip_ratio": 0.0, "completion_length": 281.109375, "epoch": 1.58203125, "grad_norm": 1.6327596150343278, "kl": 0.087158203125, "learning_rate": 6.046142578125e-07, "loss": 0.0035, "reward": 1.799069106578827, "reward_std": 0.057514723390340805, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7990691363811493, "step": 3239 }, { "clip_ratio": 0.0, "completion_length": 323.203125, "epoch": 1.58251953125, "grad_norm": 2.100124971039901, "kl": 0.0732421875, "learning_rate": 6.044921875e-07, "loss": 0.0029, "reward": 1.7056349515914917, "reward_std": 0.021463132463395596, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7056349515914917, "step": 3240 }, { "clip_ratio": 0.0, "completion_length": 286.109375, "epoch": 1.5830078125, "grad_norm": 0.9123360385826916, "kl": 0.0704345703125, "learning_rate": 6.043701171875e-07, "loss": 0.0028, "reward": 1.8157188296318054, "reward_std": 0.06355854496359825, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8157188296318054, "step": 3241 }, { "clip_ratio": 0.0, "completion_length": 247.7421875, "epoch": 1.58349609375, "grad_norm": 3.3904581418786015, "kl": 0.08642578125, "learning_rate": 6.042480468749999e-07, "loss": 0.0035, "reward": 1.8644654154777527, "reward_std": 0.06617464870214462, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8644654750823975, "step": 3242 }, { "clip_ratio": 0.0, "completion_length": 263.0390625, "epoch": 1.583984375, "grad_norm": 2.5280802263881643, "kl": 0.06982421875, "learning_rate": 6.041259765624999e-07, "loss": 0.0028, "reward": 1.8594006896018982, "reward_std": 0.03832878777757287, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8672131299972534, "step": 3243 }, { "clip_ratio": 0.0, "completion_length": 235.0625, "epoch": 1.58447265625, "grad_norm": 1.7853672075638696, "kl": 0.09375, "learning_rate": 6.0400390625e-07, "loss": 0.0037, "reward": 1.7820322513580322, "reward_std": 0.03775404021143913, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7820322811603546, "step": 3244 }, { "clip_ratio": 0.0, "completion_length": 211.4609375, "epoch": 1.5849609375, "grad_norm": 3.1479729709522544, "kl": 0.09033203125, "learning_rate": 6.038818359375e-07, "loss": 0.0036, "reward": 1.828788161277771, "reward_std": 0.029659430496394634, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8287881314754486, "step": 3245 }, { "clip_ratio": 0.0, "completion_length": 263.7734375, "epoch": 1.58544921875, "grad_norm": 1.8693724346947198, "kl": 0.091552734375, "learning_rate": 6.03759765625e-07, "loss": 0.0037, "reward": 1.6129669547080994, "reward_std": 0.051816992461681366, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6129669547080994, "step": 3246 }, { "clip_ratio": 0.0, "completion_length": 225.0625, "epoch": 1.5859375, "grad_norm": 2.4589967009937026, "kl": 0.0771484375, "learning_rate": 6.036376953125e-07, "loss": 0.0031, "reward": 1.776804268360138, "reward_std": 0.04013761132955551, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7768042683601379, "step": 3247 }, { "clip_ratio": 0.0, "completion_length": 303.078125, "epoch": 1.58642578125, "grad_norm": 0.9274776553938363, "kl": 0.080322265625, "learning_rate": 6.03515625e-07, "loss": 0.0032, "reward": 1.7535163760185242, "reward_std": 0.046172965317964554, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7535163760185242, "step": 3248 }, { "clip_ratio": 0.0, "completion_length": 276.0859375, "epoch": 1.5869140625, "grad_norm": 1.1471100947644117, "kl": 0.07421875, "learning_rate": 6.033935546874999e-07, "loss": 0.003, "reward": 1.738187551498413, "reward_std": 0.0872982544824481, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7460000813007355, "step": 3249 }, { "clip_ratio": 0.0, "completion_length": 240.390625, "epoch": 1.58740234375, "grad_norm": 0.9499426055546841, "kl": 0.0634765625, "learning_rate": 6.032714843749999e-07, "loss": 0.0025, "reward": 1.8778213262557983, "reward_std": 0.08278293255716562, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.9012588858604431, "step": 3250 }, { "clip_ratio": 0.0, "completion_length": 282.3984375, "epoch": 1.587890625, "grad_norm": 2.2261842170424577, "kl": 0.080322265625, "learning_rate": 6.031494140625e-07, "loss": 0.0032, "reward": 1.7778486013412476, "reward_std": 0.0640218211337924, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7856611609458923, "step": 3251 }, { "clip_ratio": 0.0, "completion_length": 272.21875, "epoch": 1.58837890625, "grad_norm": 1.2139552367739217, "kl": 0.0704345703125, "learning_rate": 6.0302734375e-07, "loss": 0.0028, "reward": 1.8429046869277954, "reward_std": 0.028244564309716225, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8429047465324402, "step": 3252 }, { "clip_ratio": 0.0, "completion_length": 346.8828125, "epoch": 1.5888671875, "grad_norm": 1.4532070226696283, "kl": 0.0849609375, "learning_rate": 6.029052734375e-07, "loss": 0.0034, "reward": 1.6911569833755493, "reward_std": 0.1081528514623642, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6989694237709045, "step": 3253 }, { "clip_ratio": 0.0, "completion_length": 195.234375, "epoch": 1.58935546875, "grad_norm": 1.2377545429292913, "kl": 0.066650390625, "learning_rate": 6.02783203125e-07, "loss": 0.0027, "reward": 1.853829026222229, "reward_std": 0.02524241991341114, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8538290560245514, "step": 3254 }, { "clip_ratio": 0.0, "completion_length": 243.6328125, "epoch": 1.58984375, "grad_norm": 1.0689435406627494, "kl": 0.07568359375, "learning_rate": 6.026611328124999e-07, "loss": 0.003, "reward": 1.7502402663230896, "reward_std": 0.08583210222423077, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7580527365207672, "step": 3255 }, { "clip_ratio": 0.0, "completion_length": 333.609375, "epoch": 1.59033203125, "grad_norm": 2.129247438581581, "kl": 0.07275390625, "learning_rate": 6.025390624999999e-07, "loss": 0.0029, "reward": 1.6336244344711304, "reward_std": 0.08128929510712624, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6336244642734528, "step": 3256 }, { "clip_ratio": 0.0, "completion_length": 234.0625, "epoch": 1.5908203125, "grad_norm": 2.9401186862267634, "kl": 0.076904296875, "learning_rate": 6.024169921875e-07, "loss": 0.0031, "reward": 1.7974181175231934, "reward_std": 0.03139576967805624, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7974181473255157, "step": 3257 }, { "clip_ratio": 0.0, "completion_length": 279.015625, "epoch": 1.59130859375, "grad_norm": 1.7758066245944604, "kl": 0.0751953125, "learning_rate": 6.02294921875e-07, "loss": 0.003, "reward": 1.7531208395957947, "reward_std": 0.02848457545042038, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7531208395957947, "step": 3258 }, { "clip_ratio": 0.0, "completion_length": 325.859375, "epoch": 1.591796875, "grad_norm": 6.7846858101834435, "kl": 0.0712890625, "learning_rate": 6.021728515625e-07, "loss": 0.0029, "reward": 1.7426277995109558, "reward_std": 0.06002306379377842, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.742627739906311, "step": 3259 }, { "clip_ratio": 0.0, "completion_length": 286.5625, "epoch": 1.59228515625, "grad_norm": 3.355150773217038, "kl": 0.0810546875, "learning_rate": 6.0205078125e-07, "loss": 0.0032, "reward": 1.913890540599823, "reward_std": 0.041608670726418495, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9138904809951782, "step": 3260 }, { "clip_ratio": 0.0, "completion_length": 235.2421875, "epoch": 1.5927734375, "grad_norm": 0.9105120534656213, "kl": 0.0877685546875, "learning_rate": 6.019287109375e-07, "loss": 0.0035, "reward": 1.8028390407562256, "reward_std": 0.07113232091069221, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8028390407562256, "step": 3261 }, { "clip_ratio": 0.0, "completion_length": 327.703125, "epoch": 1.59326171875, "grad_norm": 0.9245025145271322, "kl": 0.07470703125, "learning_rate": 6.018066406249999e-07, "loss": 0.003, "reward": 1.671428918838501, "reward_std": 0.05243074335157871, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6714289486408234, "step": 3262 }, { "clip_ratio": 0.0, "completion_length": 326.9765625, "epoch": 1.59375, "grad_norm": 2.130691763806817, "kl": 0.06982421875, "learning_rate": 6.016845703124999e-07, "loss": 0.0028, "reward": 1.7941365838050842, "reward_std": 0.09248049557209015, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.809761643409729, "step": 3263 }, { "clip_ratio": 0.0, "completion_length": 305.8828125, "epoch": 1.59423828125, "grad_norm": 1.4565066323176799, "kl": 0.0625, "learning_rate": 6.015625e-07, "loss": 0.0025, "reward": 1.7813687324523926, "reward_std": 0.07458901032805443, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7891812920570374, "step": 3264 }, { "clip_ratio": 0.0, "completion_length": 293.3359375, "epoch": 1.5947265625, "grad_norm": 1.0694645510879401, "kl": 0.068115234375, "learning_rate": 6.014404296875e-07, "loss": 0.0027, "reward": 1.7209742665290833, "reward_std": 0.12353447079658508, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7678492069244385, "step": 3265 }, { "clip_ratio": 0.0, "completion_length": 330.6171875, "epoch": 1.59521484375, "grad_norm": 2.180425663820791, "kl": 0.074951171875, "learning_rate": 6.01318359375e-07, "loss": 0.003, "reward": 1.7168704271316528, "reward_std": 0.10325317457318306, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7324954271316528, "step": 3266 }, { "clip_ratio": 0.0, "completion_length": 325.9140625, "epoch": 1.595703125, "grad_norm": 2.791039159238429, "kl": 0.0684814453125, "learning_rate": 6.011962890625e-07, "loss": 0.0027, "reward": 1.7321181297302246, "reward_std": 0.10391049832105637, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.786805659532547, "step": 3267 }, { "clip_ratio": 0.0, "completion_length": 370.0078125, "epoch": 1.59619140625, "grad_norm": 0.8947974535479477, "kl": 0.0577392578125, "learning_rate": 6.010742187499999e-07, "loss": 0.0023, "reward": 1.8403544425964355, "reward_std": 0.055374979972839355, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8481669425964355, "step": 3268 }, { "clip_ratio": 0.0, "completion_length": 319.0078125, "epoch": 1.5966796875, "grad_norm": 1.1528503859568437, "kl": 0.07568359375, "learning_rate": 6.009521484374999e-07, "loss": 0.003, "reward": 1.6485916376113892, "reward_std": 0.018121136352419853, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6485916376113892, "step": 3269 }, { "clip_ratio": 0.0, "completion_length": 336.5078125, "epoch": 1.59716796875, "grad_norm": 3.0563137616187652, "kl": 0.0594482421875, "learning_rate": 6.00830078125e-07, "loss": 0.0024, "reward": 1.7807026505470276, "reward_std": 0.1419503539800644, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7807026207447052, "step": 3270 }, { "clip_ratio": 0.0, "completion_length": 249.140625, "epoch": 1.59765625, "grad_norm": 1.335457862432662, "kl": 0.084228515625, "learning_rate": 6.007080078125e-07, "loss": 0.0034, "reward": 1.797263503074646, "reward_std": 0.06179828941822052, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.797263503074646, "step": 3271 }, { "clip_ratio": 0.0, "completion_length": 258.609375, "epoch": 1.59814453125, "grad_norm": 1.121524259737413, "kl": 0.06689453125, "learning_rate": 6.005859375e-07, "loss": 0.0027, "reward": 1.7773959040641785, "reward_std": 0.05081337783485651, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7773959338665009, "step": 3272 }, { "clip_ratio": 0.0, "completion_length": 290.15625, "epoch": 1.5986328125, "grad_norm": 2.866249679866173, "kl": 0.0645751953125, "learning_rate": 6.004638671875e-07, "loss": 0.0026, "reward": 1.8175668716430664, "reward_std": 0.11312521249055862, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8253794610500336, "step": 3273 }, { "clip_ratio": 0.0, "completion_length": 313.453125, "epoch": 1.59912109375, "grad_norm": 3.606065177774678, "kl": 0.0640869140625, "learning_rate": 6.00341796875e-07, "loss": 0.0026, "reward": 1.781424641609192, "reward_std": 0.08653675019741058, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7814246118068695, "step": 3274 }, { "clip_ratio": 0.0, "completion_length": 367.3828125, "epoch": 1.599609375, "grad_norm": 3.5178633189395523, "kl": 0.08203125, "learning_rate": 6.002197265624999e-07, "loss": 0.0033, "reward": 1.7668121457099915, "reward_std": 0.126564159989357, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7824371457099915, "step": 3275 }, { "clip_ratio": 0.0, "completion_length": 368.171875, "epoch": 1.60009765625, "grad_norm": 0.6996774519584336, "kl": 0.0472412109375, "learning_rate": 6.000976562499999e-07, "loss": 0.0019, "reward": 1.814025104045868, "reward_std": 0.12107747420668602, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.8609001040458679, "step": 3276 }, { "clip_ratio": 0.0, "completion_length": 388.75, "epoch": 1.6005859375, "grad_norm": 0.8261108370026679, "kl": 0.0513916015625, "learning_rate": 5.999755859375e-07, "loss": 0.0021, "reward": 1.7932913899421692, "reward_std": 0.032305057160556316, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7932913601398468, "step": 3277 }, { "clip_ratio": 0.0, "completion_length": 263.3203125, "epoch": 1.60107421875, "grad_norm": 2.058098522878682, "kl": 0.0830078125, "learning_rate": 5.99853515625e-07, "loss": 0.0033, "reward": 1.8758089542388916, "reward_std": 0.0519051980227232, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8758089244365692, "step": 3278 }, { "clip_ratio": 0.0, "completion_length": 378.4765625, "epoch": 1.6015625, "grad_norm": 3.9373950604712626, "kl": 0.0494384765625, "learning_rate": 5.997314453125e-07, "loss": 0.002, "reward": 1.8489559888839722, "reward_std": 0.04422624595463276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8489560484886169, "step": 3279 }, { "clip_ratio": 0.0, "completion_length": 295.7265625, "epoch": 1.60205078125, "grad_norm": 7.884811127959207, "kl": 0.0703125, "learning_rate": 5.99609375e-07, "loss": 0.0028, "reward": 1.6899959444999695, "reward_std": 0.10859640687704086, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6978083550930023, "step": 3280 }, { "clip_ratio": 0.0, "completion_length": 419.2265625, "epoch": 1.6025390625, "grad_norm": 2.7975461752500577, "kl": 0.054443359375, "learning_rate": 5.994873046875e-07, "loss": 0.0022, "reward": 1.6196198463439941, "reward_std": 0.1712161898612976, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.6664949059486389, "step": 3281 }, { "clip_ratio": 0.0, "completion_length": 333.0234375, "epoch": 1.60302734375, "grad_norm": 0.9151595270896026, "kl": 0.086181640625, "learning_rate": 5.993652343749999e-07, "loss": 0.0034, "reward": 1.7856322526931763, "reward_std": 0.02972456067800522, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7856322526931763, "step": 3282 }, { "clip_ratio": 0.0, "completion_length": 341.7109375, "epoch": 1.603515625, "grad_norm": 1.7965453423126865, "kl": 0.097900390625, "learning_rate": 5.992431640625e-07, "loss": 0.0039, "reward": 1.7602424621582031, "reward_std": 0.13400599360466003, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7758674323558807, "step": 3283 }, { "clip_ratio": 0.0, "completion_length": 371.90625, "epoch": 1.60400390625, "grad_norm": 1.0639662963187586, "kl": 0.0548095703125, "learning_rate": 5.9912109375e-07, "loss": 0.0022, "reward": 1.8079357147216797, "reward_std": 0.18928005546331406, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8313732445240021, "step": 3284 }, { "clip_ratio": 0.0, "completion_length": 366.953125, "epoch": 1.6044921875, "grad_norm": 1.2197720201061695, "kl": 0.057373046875, "learning_rate": 5.989990234375e-07, "loss": 0.0023, "reward": 1.821268081665039, "reward_std": 0.02508594747632742, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8212681412696838, "step": 3285 }, { "clip_ratio": 0.0, "completion_length": 353.609375, "epoch": 1.60498046875, "grad_norm": 1.4231911127812176, "kl": 0.0611572265625, "learning_rate": 5.98876953125e-07, "loss": 0.0024, "reward": 1.851391077041626, "reward_std": 0.12025601789355278, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8670159578323364, "step": 3286 }, { "clip_ratio": 0.0, "completion_length": 335.4296875, "epoch": 1.60546875, "grad_norm": 1.911311177993667, "kl": 0.0947265625, "learning_rate": 5.987548828125e-07, "loss": 0.0038, "reward": 1.7131580114364624, "reward_std": 0.07230347953736782, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7131580114364624, "step": 3287 }, { "clip_ratio": 0.0, "completion_length": 372.015625, "epoch": 1.60595703125, "grad_norm": 0.8159968339623103, "kl": 0.0516357421875, "learning_rate": 5.986328124999999e-07, "loss": 0.0021, "reward": 1.7010605335235596, "reward_std": 0.12904052436351776, "rewards/format_reward": 0.953125, "rewards/ocr_reward": 0.7479356527328491, "step": 3288 }, { "clip_ratio": 0.0, "completion_length": 274.125, "epoch": 1.6064453125, "grad_norm": 0.8081713539518741, "kl": 0.072509765625, "learning_rate": 5.985107421874999e-07, "loss": 0.0029, "reward": 1.8329209685325623, "reward_std": 0.02778689656406641, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8329209983348846, "step": 3289 }, { "clip_ratio": 0.0, "completion_length": 326.796875, "epoch": 1.60693359375, "grad_norm": 1.5031271056588227, "kl": 0.072509765625, "learning_rate": 5.98388671875e-07, "loss": 0.0029, "reward": 1.794043481349945, "reward_std": 0.05896776542067528, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7940434813499451, "step": 3290 }, { "clip_ratio": 0.0, "completion_length": 398.7890625, "epoch": 1.607421875, "grad_norm": 1.3872359065707587, "kl": 0.07470703125, "learning_rate": 5.982666015625e-07, "loss": 0.003, "reward": 1.6995200514793396, "reward_std": 0.08452805131673813, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7073325216770172, "step": 3291 }, { "clip_ratio": 0.0, "completion_length": 234.8046875, "epoch": 1.60791015625, "grad_norm": 0.8648030111794723, "kl": 0.078857421875, "learning_rate": 5.9814453125e-07, "loss": 0.0032, "reward": 1.7610323429107666, "reward_std": 0.05742851458489895, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7688448131084442, "step": 3292 }, { "clip_ratio": 0.0, "completion_length": 355.671875, "epoch": 1.6083984375, "grad_norm": 6.153849152464751, "kl": 0.082763671875, "learning_rate": 5.980224609375e-07, "loss": 0.0033, "reward": 1.7795958518981934, "reward_std": 0.07071587443351746, "rewards/format_reward": 0.9453125, "rewards/ocr_reward": 0.8342833817005157, "step": 3293 }, { "clip_ratio": 0.0, "completion_length": 373.34375, "epoch": 1.60888671875, "grad_norm": 1.0323918030975063, "kl": 0.064453125, "learning_rate": 5.97900390625e-07, "loss": 0.0026, "reward": 1.8604564666748047, "reward_std": 0.04785814322531223, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8604564666748047, "step": 3294 }, { "clip_ratio": 0.0, "completion_length": 316.6015625, "epoch": 1.609375, "grad_norm": 1.4849996417871474, "kl": 0.074462890625, "learning_rate": 5.977783203124999e-07, "loss": 0.003, "reward": 1.703747808933258, "reward_std": 0.08227039128541946, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7037478089332581, "step": 3295 }, { "clip_ratio": 0.0, "completion_length": 360.125, "epoch": 1.60986328125, "grad_norm": 2.1214209431752122, "kl": 0.06298828125, "learning_rate": 5.9765625e-07, "loss": 0.0025, "reward": 1.794127881526947, "reward_std": 0.07666090503334999, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7941278219223022, "step": 3296 }, { "clip_ratio": 0.0, "completion_length": 261.859375, "epoch": 1.6103515625, "grad_norm": 1.6164264023490769, "kl": 0.086181640625, "learning_rate": 5.975341796875e-07, "loss": 0.0035, "reward": 1.6789074540138245, "reward_std": 0.033364531584084034, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6789074838161469, "step": 3297 }, { "clip_ratio": 0.0, "completion_length": 333.9453125, "epoch": 1.61083984375, "grad_norm": 1.8073919332154889, "kl": 0.076171875, "learning_rate": 5.97412109375e-07, "loss": 0.003, "reward": 1.6800431609153748, "reward_std": 0.06180498003959656, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6800430715084076, "step": 3298 }, { "clip_ratio": 0.0, "completion_length": 305.15625, "epoch": 1.611328125, "grad_norm": 1.9650430682434774, "kl": 0.0849609375, "learning_rate": 5.972900390625e-07, "loss": 0.0034, "reward": 1.720855951309204, "reward_std": 0.08976828306913376, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7442933619022369, "step": 3299 }, { "clip_ratio": 0.0, "completion_length": 244.953125, "epoch": 1.61181640625, "grad_norm": 1.5026385468711767, "kl": 0.07373046875, "learning_rate": 5.9716796875e-07, "loss": 0.0029, "reward": 1.7150686383247375, "reward_std": 0.10774907097220421, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7228811085224152, "step": 3300 }, { "clip_ratio": 0.0, "completion_length": 187.9609375, "epoch": 1.6123046875, "grad_norm": 3.9327500342779778, "kl": 0.08203125, "learning_rate": 5.970458984374999e-07, "loss": 0.0033, "reward": 1.7554203271865845, "reward_std": 0.09172924142330885, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7554203867912292, "step": 3301 }, { "clip_ratio": 0.0, "completion_length": 237.0546875, "epoch": 1.61279296875, "grad_norm": 2.715132927951165, "kl": 0.083251953125, "learning_rate": 5.969238281249999e-07, "loss": 0.0033, "reward": 1.752245843410492, "reward_std": 0.0424564378336072, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7522459030151367, "step": 3302 }, { "clip_ratio": 0.0, "completion_length": 195.9375, "epoch": 1.61328125, "grad_norm": 0.9299085475576804, "kl": 0.067626953125, "learning_rate": 5.968017578125e-07, "loss": 0.0027, "reward": 1.815238118171692, "reward_std": 0.04331210441887379, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8152380585670471, "step": 3303 }, { "clip_ratio": 0.0, "completion_length": 269.59375, "epoch": 1.61376953125, "grad_norm": 1.4842707399360437, "kl": 0.07080078125, "learning_rate": 5.966796875e-07, "loss": 0.0028, "reward": 1.6607686877250671, "reward_std": 0.0442405054345727, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6607686877250671, "step": 3304 }, { "clip_ratio": 0.0, "completion_length": 287.65625, "epoch": 1.6142578125, "grad_norm": 2.0590875753569335, "kl": 0.064697265625, "learning_rate": 5.965576171875e-07, "loss": 0.0026, "reward": 1.8585594296455383, "reward_std": 0.05867746938019991, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8585593700408936, "step": 3305 }, { "clip_ratio": 0.0, "completion_length": 317.8125, "epoch": 1.61474609375, "grad_norm": 1.6223776518735307, "kl": 0.089599609375, "learning_rate": 5.96435546875e-07, "loss": 0.0036, "reward": 1.7585085034370422, "reward_std": 0.055630091577768326, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.758508563041687, "step": 3306 }, { "clip_ratio": 0.0, "completion_length": 242.3515625, "epoch": 1.615234375, "grad_norm": 6.29346114659625, "kl": 0.0816650390625, "learning_rate": 5.963134765625e-07, "loss": 0.0033, "reward": 1.6852021217346191, "reward_std": 0.030728538520634174, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6852021813392639, "step": 3307 }, { "clip_ratio": 0.0, "completion_length": 229.640625, "epoch": 1.61572265625, "grad_norm": 57.61803616396629, "kl": 0.114501953125, "learning_rate": 5.961914062499999e-07, "loss": 0.0046, "reward": 1.651352047920227, "reward_std": 0.05351191433146596, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6513520181179047, "step": 3308 }, { "clip_ratio": 0.0, "completion_length": 293.0, "epoch": 1.6162109375, "grad_norm": 1.3324160676364192, "kl": 0.0623779296875, "learning_rate": 5.960693359375e-07, "loss": 0.0025, "reward": 1.818449318408966, "reward_std": 0.07615053281188011, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8262618482112885, "step": 3309 }, { "clip_ratio": 0.0, "completion_length": 366.6015625, "epoch": 1.61669921875, "grad_norm": 1.5524145516661523, "kl": 0.0604248046875, "learning_rate": 5.95947265625e-07, "loss": 0.0024, "reward": 1.7558764815330505, "reward_std": 0.08649563789367676, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7558764517307281, "step": 3310 }, { "clip_ratio": 0.0, "completion_length": 234.9296875, "epoch": 1.6171875, "grad_norm": 2.2022666625672174, "kl": 0.077880859375, "learning_rate": 5.958251953125e-07, "loss": 0.0031, "reward": 1.9051913619041443, "reward_std": 0.057089509442448616, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9051913619041443, "step": 3311 }, { "clip_ratio": 0.0, "completion_length": 316.3125, "epoch": 1.61767578125, "grad_norm": 2.122944616604469, "kl": 0.087646484375, "learning_rate": 5.95703125e-07, "loss": 0.0035, "reward": 1.7621399760246277, "reward_std": 0.07783204689621925, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7621399760246277, "step": 3312 }, { "clip_ratio": 0.0, "completion_length": 302.375, "epoch": 1.6181640625, "grad_norm": 0.6451018067962863, "kl": 0.092041015625, "learning_rate": 5.955810546875e-07, "loss": 0.0037, "reward": 1.8151302337646484, "reward_std": 0.17584221065044403, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8385677337646484, "step": 3313 }, { "clip_ratio": 0.0, "completion_length": 264.3125, "epoch": 1.61865234375, "grad_norm": 2.9643724571250902, "kl": 0.072021484375, "learning_rate": 5.954589843749999e-07, "loss": 0.0029, "reward": 1.8480368256568909, "reward_std": 0.04931685887277126, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8480368256568909, "step": 3314 }, { "clip_ratio": 0.0, "completion_length": 311.6171875, "epoch": 1.619140625, "grad_norm": 14.73302874665288, "kl": 0.175048828125, "learning_rate": 5.953369140624999e-07, "loss": 0.007, "reward": 1.781773030757904, "reward_std": 0.14042264595627785, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.805210530757904, "step": 3315 }, { "clip_ratio": 0.0, "completion_length": 254.96875, "epoch": 1.61962890625, "grad_norm": 0.8900575904775362, "kl": 0.075927734375, "learning_rate": 5.9521484375e-07, "loss": 0.003, "reward": 1.8554713726043701, "reward_std": 0.06886312644928694, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8632838726043701, "step": 3316 }, { "clip_ratio": 0.0, "completion_length": 212.515625, "epoch": 1.6201171875, "grad_norm": 2.466288409101878, "kl": 0.077880859375, "learning_rate": 5.950927734375e-07, "loss": 0.0031, "reward": 1.7723018527030945, "reward_std": 0.02210051123984158, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7723018527030945, "step": 3317 }, { "clip_ratio": 0.0, "completion_length": 195.59375, "epoch": 1.62060546875, "grad_norm": 2.5984727262943124, "kl": 0.074462890625, "learning_rate": 5.94970703125e-07, "loss": 0.003, "reward": 1.7868224382400513, "reward_std": 0.05945824505761266, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7868223786354065, "step": 3318 }, { "clip_ratio": 0.0, "completion_length": 381.5390625, "epoch": 1.62109375, "grad_norm": 1.4154199137348544, "kl": 0.0694580078125, "learning_rate": 5.948486328125e-07, "loss": 0.0028, "reward": 1.7688942551612854, "reward_std": 0.13842950016260147, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8079566955566406, "step": 3319 }, { "clip_ratio": 0.0, "completion_length": 244.0078125, "epoch": 1.62158203125, "grad_norm": 0.6836739446073203, "kl": 0.085205078125, "learning_rate": 5.947265625e-07, "loss": 0.0034, "reward": 1.7379599213600159, "reward_std": 0.05289880000054836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7379598617553711, "step": 3320 }, { "clip_ratio": 0.0, "completion_length": 276.953125, "epoch": 1.6220703125, "grad_norm": 1.1625041958754734, "kl": 0.08544921875, "learning_rate": 5.946044921874999e-07, "loss": 0.0034, "reward": 1.901548981666565, "reward_std": 0.04509174823760986, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9015489816665649, "step": 3321 }, { "clip_ratio": 0.0, "completion_length": 242.4453125, "epoch": 1.62255859375, "grad_norm": 2.346547215431855, "kl": 0.12109375, "learning_rate": 5.94482421875e-07, "loss": 0.0049, "reward": 1.7190340757369995, "reward_std": 0.10668664053082466, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7268466651439667, "step": 3322 }, { "clip_ratio": 0.0, "completion_length": 312.4296875, "epoch": 1.623046875, "grad_norm": 1.7067499710514877, "kl": 0.0728759765625, "learning_rate": 5.943603515625e-07, "loss": 0.0029, "reward": 1.7981135249137878, "reward_std": 0.08968368917703629, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7981135547161102, "step": 3323 }, { "clip_ratio": 0.0, "completion_length": 300.3046875, "epoch": 1.62353515625, "grad_norm": 0.9846710994291399, "kl": 0.071533203125, "learning_rate": 5.9423828125e-07, "loss": 0.0029, "reward": 1.8831993341445923, "reward_std": 0.029867228120565414, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8831993639469147, "step": 3324 }, { "clip_ratio": 0.0, "completion_length": 256.5, "epoch": 1.6240234375, "grad_norm": 2.922443835097143, "kl": 0.08642578125, "learning_rate": 5.941162109375e-07, "loss": 0.0035, "reward": 1.7861003875732422, "reward_std": 0.06693215668201447, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7861004769802094, "step": 3325 }, { "clip_ratio": 0.0, "completion_length": 264.140625, "epoch": 1.62451171875, "grad_norm": 0.6981453405142553, "kl": 0.0657958984375, "learning_rate": 5.93994140625e-07, "loss": 0.0026, "reward": 1.9097455143928528, "reward_std": 0.02694264892488718, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9097454845905304, "step": 3326 }, { "clip_ratio": 0.0, "completion_length": 313.75, "epoch": 1.625, "grad_norm": 1.7535640438867004, "kl": 0.076904296875, "learning_rate": 5.938720703124999e-07, "loss": 0.0031, "reward": 1.8061844110488892, "reward_std": 0.04717784374952316, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8061844110488892, "step": 3327 }, { "clip_ratio": 0.0, "completion_length": 342.1015625, "epoch": 1.62548828125, "grad_norm": 1.2336422112092555, "kl": 0.064697265625, "learning_rate": 5.937499999999999e-07, "loss": 0.0026, "reward": 1.7810336351394653, "reward_std": 0.07749061286449432, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7888461053371429, "step": 3328 }, { "clip_ratio": 0.0, "completion_length": 387.09375, "epoch": 1.6259765625, "grad_norm": 3.487351497648713, "kl": 0.06494140625, "learning_rate": 5.936279296875e-07, "loss": 0.0026, "reward": 1.6700169444084167, "reward_std": 0.17180902510881424, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.693454384803772, "step": 3329 }, { "clip_ratio": 0.0, "completion_length": 313.1875, "epoch": 1.62646484375, "grad_norm": 13.359609968705223, "kl": 0.08935546875, "learning_rate": 5.93505859375e-07, "loss": 0.0036, "reward": 1.6575063467025757, "reward_std": 0.055701796896755695, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6575063467025757, "step": 3330 }, { "clip_ratio": 0.0, "completion_length": 288.3984375, "epoch": 1.626953125, "grad_norm": 0.6032534098055211, "kl": 0.06396484375, "learning_rate": 5.933837890625e-07, "loss": 0.0026, "reward": 1.8520901799201965, "reward_std": 0.07921074330806732, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8599026799201965, "step": 3331 }, { "clip_ratio": 0.0, "completion_length": 312.078125, "epoch": 1.62744140625, "grad_norm": 1.2697462288963357, "kl": 0.08154296875, "learning_rate": 5.9326171875e-07, "loss": 0.0033, "reward": 1.7715474963188171, "reward_std": 0.06629283353686333, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7715475261211395, "step": 3332 }, { "clip_ratio": 0.0, "completion_length": 264.8515625, "epoch": 1.6279296875, "grad_norm": 2.641032389095305, "kl": 0.0672607421875, "learning_rate": 5.931396484375e-07, "loss": 0.0027, "reward": 1.69329833984375, "reward_std": 0.06569128856062889, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6932983696460724, "step": 3333 }, { "clip_ratio": 0.0, "completion_length": 224.78125, "epoch": 1.62841796875, "grad_norm": 2.5623974388990454, "kl": 0.0908203125, "learning_rate": 5.930175781249999e-07, "loss": 0.0036, "reward": 1.8435781002044678, "reward_std": 0.0874359430745244, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8435779809951782, "step": 3334 }, { "clip_ratio": 0.0, "completion_length": 298.828125, "epoch": 1.62890625, "grad_norm": 1.5883862067873453, "kl": 0.088623046875, "learning_rate": 5.928955078125e-07, "loss": 0.0035, "reward": 1.7597174644470215, "reward_std": 0.08109994605183601, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7675299346446991, "step": 3335 }, { "clip_ratio": 0.0, "completion_length": 297.65625, "epoch": 1.62939453125, "grad_norm": 1.0616997874872647, "kl": 0.06640625, "learning_rate": 5.927734375e-07, "loss": 0.0027, "reward": 1.7667133212089539, "reward_std": 0.1313837133347988, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7901508212089539, "step": 3336 }, { "clip_ratio": 0.0, "completion_length": 255.1875, "epoch": 1.6298828125, "grad_norm": 2.9226107751812354, "kl": 0.1103515625, "learning_rate": 5.926513671875e-07, "loss": 0.0044, "reward": 1.6865645051002502, "reward_std": 0.06128368899226189, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6865644752979279, "step": 3337 }, { "clip_ratio": 0.0, "completion_length": 261.9765625, "epoch": 1.63037109375, "grad_norm": 1.4254524637894548, "kl": 0.0645751953125, "learning_rate": 5.92529296875e-07, "loss": 0.0026, "reward": 1.7799670696258545, "reward_std": 0.02988600544631481, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7799670994281769, "step": 3338 }, { "clip_ratio": 0.0, "completion_length": 282.34375, "epoch": 1.630859375, "grad_norm": 0.9006275035038049, "kl": 0.0548095703125, "learning_rate": 5.924072265625e-07, "loss": 0.0022, "reward": 1.8364945650100708, "reward_std": 0.03155016852542758, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8364945650100708, "step": 3339 }, { "clip_ratio": 0.0, "completion_length": 287.203125, "epoch": 1.63134765625, "grad_norm": 1.9607130935646655, "kl": 0.080322265625, "learning_rate": 5.922851562499999e-07, "loss": 0.0032, "reward": 1.77052640914917, "reward_std": 0.06949007511138916, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7705264091491699, "step": 3340 }, { "clip_ratio": 0.0, "completion_length": 251.375, "epoch": 1.6318359375, "grad_norm": 4.07493900518628, "kl": 0.076416015625, "learning_rate": 5.921630859374999e-07, "loss": 0.003, "reward": 1.8116753101348877, "reward_std": 0.11965424194931984, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8116753697395325, "step": 3341 }, { "clip_ratio": 0.0, "completion_length": 290.09375, "epoch": 1.63232421875, "grad_norm": 3.3284633422339027, "kl": 0.06884765625, "learning_rate": 5.92041015625e-07, "loss": 0.0028, "reward": 1.7393649220466614, "reward_std": 0.11131243035197258, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7628024816513062, "step": 3342 }, { "clip_ratio": 0.0, "completion_length": 260.328125, "epoch": 1.6328125, "grad_norm": 1.6845731379939248, "kl": 0.076171875, "learning_rate": 5.919189453125e-07, "loss": 0.003, "reward": 1.7558993101119995, "reward_std": 0.03900916501879692, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7558992505073547, "step": 3343 }, { "clip_ratio": 0.0, "completion_length": 311.03125, "epoch": 1.63330078125, "grad_norm": 1.3128493226455236, "kl": 0.06005859375, "learning_rate": 5.91796875e-07, "loss": 0.0024, "reward": 1.6177734732627869, "reward_std": 0.0996141117066145, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6490235030651093, "step": 3344 }, { "clip_ratio": 0.0, "completion_length": 294.984375, "epoch": 1.6337890625, "grad_norm": 1.7117136465741267, "kl": 0.086669921875, "learning_rate": 5.916748046875e-07, "loss": 0.0035, "reward": 1.555152177810669, "reward_std": 0.10387120954692364, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.555152177810669, "step": 3345 }, { "clip_ratio": 0.0, "completion_length": 294.2265625, "epoch": 1.63427734375, "grad_norm": 1.4566254667516192, "kl": 0.072021484375, "learning_rate": 5.91552734375e-07, "loss": 0.0029, "reward": 1.7571306228637695, "reward_std": 0.05150624364614487, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7571305930614471, "step": 3346 }, { "clip_ratio": 0.0, "completion_length": 335.4140625, "epoch": 1.634765625, "grad_norm": 2.543802498479339, "kl": 0.082275390625, "learning_rate": 5.914306640624999e-07, "loss": 0.0033, "reward": 1.732638418674469, "reward_std": 0.11016843095421791, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7404508292675018, "step": 3347 }, { "clip_ratio": 0.0, "completion_length": 291.0078125, "epoch": 1.63525390625, "grad_norm": 7.467359428277764, "kl": 0.0714111328125, "learning_rate": 5.913085937499999e-07, "loss": 0.0029, "reward": 1.9553462266921997, "reward_std": 0.07758795842528343, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9553462266921997, "step": 3348 }, { "clip_ratio": 0.0, "completion_length": 266.03125, "epoch": 1.6357421875, "grad_norm": 1.1941702843950022, "kl": 0.058837890625, "learning_rate": 5.911865234375e-07, "loss": 0.0024, "reward": 1.6709920763969421, "reward_std": 0.05428230203688145, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6709920763969421, "step": 3349 }, { "clip_ratio": 0.0, "completion_length": 349.3125, "epoch": 1.63623046875, "grad_norm": 1.2102935214629125, "kl": 0.0623779296875, "learning_rate": 5.91064453125e-07, "loss": 0.0025, "reward": 1.8021827936172485, "reward_std": 0.03311594016849995, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8021828532218933, "step": 3350 }, { "clip_ratio": 0.0, "completion_length": 315.0546875, "epoch": 1.63671875, "grad_norm": 9.421608837634526, "kl": 0.08251953125, "learning_rate": 5.909423828125e-07, "loss": 0.0033, "reward": 1.809500515460968, "reward_std": 0.09658823721110821, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.809500515460968, "step": 3351 }, { "clip_ratio": 0.0, "completion_length": 285.3828125, "epoch": 1.63720703125, "grad_norm": 2.2569011847158373, "kl": 0.0986328125, "learning_rate": 5.908203125e-07, "loss": 0.0039, "reward": 1.721911609172821, "reward_std": 0.039531731978058815, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7219116389751434, "step": 3352 }, { "clip_ratio": 0.0, "completion_length": 315.0078125, "epoch": 1.6376953125, "grad_norm": 4.408187685160172, "kl": 0.0643310546875, "learning_rate": 5.906982421874999e-07, "loss": 0.0026, "reward": 1.8004092574119568, "reward_std": 0.10856766253709793, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.823846697807312, "step": 3353 }, { "clip_ratio": 0.0, "completion_length": 320.71875, "epoch": 1.63818359375, "grad_norm": 1.482450859345647, "kl": 0.0640869140625, "learning_rate": 5.905761718749999e-07, "loss": 0.0026, "reward": 1.8073206543922424, "reward_std": 0.09571165032684803, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8073206543922424, "step": 3354 }, { "clip_ratio": 0.0, "completion_length": 289.2890625, "epoch": 1.638671875, "grad_norm": 1.1455791471480024, "kl": 0.0482177734375, "learning_rate": 5.904541015625e-07, "loss": 0.0019, "reward": 1.8967827558517456, "reward_std": 0.06931715365499258, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9045952558517456, "step": 3355 }, { "clip_ratio": 0.0, "completion_length": 345.8359375, "epoch": 1.63916015625, "grad_norm": 2.129032656060852, "kl": 0.068359375, "learning_rate": 5.9033203125e-07, "loss": 0.0027, "reward": 1.7991633415222168, "reward_std": 0.09995237179100513, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.814788281917572, "step": 3356 }, { "clip_ratio": 0.0, "completion_length": 343.140625, "epoch": 1.6396484375, "grad_norm": 2.146074812890639, "kl": 0.060302734375, "learning_rate": 5.902099609375e-07, "loss": 0.0024, "reward": 1.8512172102928162, "reward_std": 0.03298699017614126, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8512172102928162, "step": 3357 }, { "clip_ratio": 0.0, "completion_length": 314.2578125, "epoch": 1.64013671875, "grad_norm": 3.1698472003026805, "kl": 0.124755859375, "learning_rate": 5.90087890625e-07, "loss": 0.005, "reward": 1.7051687836647034, "reward_std": 0.05394227243959904, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7051687836647034, "step": 3358 }, { "clip_ratio": 0.0, "completion_length": 333.15625, "epoch": 1.640625, "grad_norm": 1.3803953318171671, "kl": 0.0633544921875, "learning_rate": 5.899658203125e-07, "loss": 0.0025, "reward": 1.8232309818267822, "reward_std": 0.1542208231985569, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8466684818267822, "step": 3359 }, { "clip_ratio": 0.0, "completion_length": 273.1171875, "epoch": 1.64111328125, "grad_norm": 1.7556288084155496, "kl": 0.084228515625, "learning_rate": 5.898437499999999e-07, "loss": 0.0034, "reward": 1.815483808517456, "reward_std": 0.07215754687786102, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.823296308517456, "step": 3360 }, { "clip_ratio": 0.0, "completion_length": 234.1796875, "epoch": 1.6416015625, "grad_norm": 0.8320346859160097, "kl": 0.074462890625, "learning_rate": 5.897216796874999e-07, "loss": 0.003, "reward": 1.7806763648986816, "reward_std": 0.0820821225643158, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.788488894701004, "step": 3361 }, { "clip_ratio": 0.0, "completion_length": 328.1796875, "epoch": 1.64208984375, "grad_norm": 1.6450666905858577, "kl": 0.094482421875, "learning_rate": 5.89599609375e-07, "loss": 0.0038, "reward": 1.786492109298706, "reward_std": 0.05385753884911537, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7864921987056732, "step": 3362 }, { "clip_ratio": 0.0, "completion_length": 242.4921875, "epoch": 1.642578125, "grad_norm": 1.0348454721040237, "kl": 0.08251953125, "learning_rate": 5.894775390625e-07, "loss": 0.0033, "reward": 1.7476333379745483, "reward_std": 0.061420466750860214, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.755445808172226, "step": 3363 }, { "clip_ratio": 0.0, "completion_length": 220.7578125, "epoch": 1.64306640625, "grad_norm": 1.2261159636339791, "kl": 0.0654296875, "learning_rate": 5.8935546875e-07, "loss": 0.0026, "reward": 1.8361621499061584, "reward_std": 0.11280067265033722, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8517871201038361, "step": 3364 }, { "clip_ratio": 0.0, "completion_length": 340.296875, "epoch": 1.6435546875, "grad_norm": 1.7651610785496405, "kl": 0.06787109375, "learning_rate": 5.892333984375e-07, "loss": 0.0027, "reward": 1.8495672345161438, "reward_std": 0.08414103463292122, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8495671451091766, "step": 3365 }, { "clip_ratio": 0.0, "completion_length": 253.8359375, "epoch": 1.64404296875, "grad_norm": 0.7490996025535948, "kl": 0.074951171875, "learning_rate": 5.891113281249999e-07, "loss": 0.003, "reward": 1.8907862901687622, "reward_std": 0.01694483682513237, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.890786349773407, "step": 3366 }, { "clip_ratio": 0.0, "completion_length": 251.5546875, "epoch": 1.64453125, "grad_norm": 0.7273295456840305, "kl": 0.08544921875, "learning_rate": 5.889892578124999e-07, "loss": 0.0034, "reward": 1.7350443005561829, "reward_std": 0.048879725858569145, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7350443005561829, "step": 3367 }, { "clip_ratio": 0.0, "completion_length": 286.21875, "epoch": 1.64501953125, "grad_norm": 7.736603835381099, "kl": 0.095947265625, "learning_rate": 5.888671875e-07, "loss": 0.0038, "reward": 1.761667251586914, "reward_std": 0.04048959631472826, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7616672217845917, "step": 3368 }, { "clip_ratio": 0.0, "completion_length": 330.4609375, "epoch": 1.6455078125, "grad_norm": 1.4724329943645882, "kl": 0.0556640625, "learning_rate": 5.887451171875e-07, "loss": 0.0022, "reward": 1.8380178213119507, "reward_std": 0.04197421669960022, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8380178809165955, "step": 3369 }, { "clip_ratio": 0.0, "completion_length": 245.5625, "epoch": 1.64599609375, "grad_norm": 1.4415743842970918, "kl": 0.07470703125, "learning_rate": 5.88623046875e-07, "loss": 0.003, "reward": 1.811439573764801, "reward_std": 0.06903266906738281, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8114396035671234, "step": 3370 }, { "clip_ratio": 0.0, "completion_length": 262.71875, "epoch": 1.646484375, "grad_norm": 1.5866478649672215, "kl": 0.072265625, "learning_rate": 5.885009765625e-07, "loss": 0.0029, "reward": 1.8194407224655151, "reward_std": 0.10099057853221893, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8194407522678375, "step": 3371 }, { "clip_ratio": 0.0, "completion_length": 262.9765625, "epoch": 1.64697265625, "grad_norm": 2.5414429477301455, "kl": 0.064697265625, "learning_rate": 5.8837890625e-07, "loss": 0.0026, "reward": 1.785763442516327, "reward_std": 0.029020313173532486, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7857634425163269, "step": 3372 }, { "clip_ratio": 0.0, "completion_length": 249.46875, "epoch": 1.6474609375, "grad_norm": 1.1864726934683525, "kl": 0.08349609375, "learning_rate": 5.882568359374999e-07, "loss": 0.0033, "reward": 1.7326732277870178, "reward_std": 0.045171596109867096, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7326732873916626, "step": 3373 }, { "clip_ratio": 0.0, "completion_length": 336.390625, "epoch": 1.64794921875, "grad_norm": 1.1727755992557876, "kl": 0.0673828125, "learning_rate": 5.881347656249999e-07, "loss": 0.0027, "reward": 1.9291696548461914, "reward_std": 0.05488063208758831, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.929169625043869, "step": 3374 }, { "clip_ratio": 0.0, "completion_length": 327.8203125, "epoch": 1.6484375, "grad_norm": 1.188974789867218, "kl": 0.07373046875, "learning_rate": 5.880126953125e-07, "loss": 0.003, "reward": 1.7475911974906921, "reward_std": 0.03924562409520149, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7475912272930145, "step": 3375 }, { "clip_ratio": 0.0, "completion_length": 261.7109375, "epoch": 1.64892578125, "grad_norm": 2.8855723689123254, "kl": 0.069580078125, "learning_rate": 5.87890625e-07, "loss": 0.0028, "reward": 1.8286888599395752, "reward_std": 0.05414394848048687, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8286888897418976, "step": 3376 }, { "clip_ratio": 0.0, "completion_length": 243.1171875, "epoch": 1.6494140625, "grad_norm": 1.348637874083575, "kl": 0.0787353515625, "learning_rate": 5.877685546875e-07, "loss": 0.0032, "reward": 1.7663710117340088, "reward_std": 0.03424928430467844, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7663710117340088, "step": 3377 }, { "clip_ratio": 0.0, "completion_length": 335.328125, "epoch": 1.64990234375, "grad_norm": 1.9070692767765587, "kl": 0.0706787109375, "learning_rate": 5.87646484375e-07, "loss": 0.0028, "reward": 1.792538821697235, "reward_std": 0.06923755258321762, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7925387620925903, "step": 3378 }, { "clip_ratio": 0.0, "completion_length": 201.46875, "epoch": 1.650390625, "grad_norm": 1.65866315067269, "kl": 0.08740234375, "learning_rate": 5.875244140625e-07, "loss": 0.0035, "reward": 1.7584347128868103, "reward_std": 0.0411848658695817, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7584347426891327, "step": 3379 }, { "clip_ratio": 0.0, "completion_length": 254.25, "epoch": 1.65087890625, "grad_norm": 2.196175009298372, "kl": 0.08349609375, "learning_rate": 5.874023437499999e-07, "loss": 0.0033, "reward": 1.621177852153778, "reward_std": 0.11486036516726017, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6289903223514557, "step": 3380 }, { "clip_ratio": 0.0, "completion_length": 203.328125, "epoch": 1.6513671875, "grad_norm": 1.6329994027998105, "kl": 0.088134765625, "learning_rate": 5.872802734375e-07, "loss": 0.0035, "reward": 1.76973557472229, "reward_std": 0.053606728091835976, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7697354555130005, "step": 3381 }, { "clip_ratio": 0.0, "completion_length": 312.5859375, "epoch": 1.65185546875, "grad_norm": 1.2486372473820984, "kl": 0.075927734375, "learning_rate": 5.87158203125e-07, "loss": 0.003, "reward": 1.9062875509262085, "reward_std": 0.02893537748605013, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9062875807285309, "step": 3382 }, { "clip_ratio": 0.0, "completion_length": 219.53125, "epoch": 1.65234375, "grad_norm": 1.5306926158806675, "kl": 0.081787109375, "learning_rate": 5.870361328125e-07, "loss": 0.0033, "reward": 1.7200778126716614, "reward_std": 0.053463514894247055, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.720077782869339, "step": 3383 }, { "clip_ratio": 0.0, "completion_length": 255.15625, "epoch": 1.65283203125, "grad_norm": 1.9605699591720807, "kl": 0.086181640625, "learning_rate": 5.869140625e-07, "loss": 0.0034, "reward": 1.7785995602607727, "reward_std": 0.05440284963697195, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7785995900630951, "step": 3384 }, { "clip_ratio": 0.0, "completion_length": 293.3671875, "epoch": 1.6533203125, "grad_norm": 1.1186012819207778, "kl": 0.08154296875, "learning_rate": 5.867919921875e-07, "loss": 0.0033, "reward": 1.7977607250213623, "reward_std": 0.04878430813550949, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7977607250213623, "step": 3385 }, { "clip_ratio": 0.0, "completion_length": 277.8125, "epoch": 1.65380859375, "grad_norm": 1.4994976766626942, "kl": 0.0703125, "learning_rate": 5.866699218749999e-07, "loss": 0.0028, "reward": 1.7266179919242859, "reward_std": 0.044663604348897934, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7266179323196411, "step": 3386 }, { "clip_ratio": 0.0, "completion_length": 298.7421875, "epoch": 1.654296875, "grad_norm": 2.908450555481079, "kl": 0.093017578125, "learning_rate": 5.865478515624999e-07, "loss": 0.0037, "reward": 1.7329715490341187, "reward_std": 0.1744391992688179, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7485965490341187, "step": 3387 }, { "clip_ratio": 0.0, "completion_length": 253.203125, "epoch": 1.65478515625, "grad_norm": 1.221054748601646, "kl": 0.102294921875, "learning_rate": 5.8642578125e-07, "loss": 0.0041, "reward": 1.7703983783721924, "reward_std": 0.06643011048436165, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7703984379768372, "step": 3388 }, { "clip_ratio": 0.0, "completion_length": 241.8671875, "epoch": 1.6552734375, "grad_norm": 6.847886742373634, "kl": 0.093017578125, "learning_rate": 5.863037109375e-07, "loss": 0.0037, "reward": 1.8793238401412964, "reward_std": 0.09725763648748398, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8793238401412964, "step": 3389 }, { "clip_ratio": 0.0, "completion_length": 316.265625, "epoch": 1.65576171875, "grad_norm": 1.6334940052830407, "kl": 0.07373046875, "learning_rate": 5.86181640625e-07, "loss": 0.0029, "reward": 1.754858374595642, "reward_std": 0.13046734035015106, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7626709043979645, "step": 3390 }, { "clip_ratio": 0.0, "completion_length": 385.203125, "epoch": 1.65625, "grad_norm": 2.4516828308421816, "kl": 0.0673828125, "learning_rate": 5.860595703125e-07, "loss": 0.0027, "reward": 1.6885485649108887, "reward_std": 0.12255653738975525, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6963610649108887, "step": 3391 }, { "clip_ratio": 0.0, "completion_length": 279.9453125, "epoch": 1.65673828125, "grad_norm": 1.0973530080385832, "kl": 0.06591796875, "learning_rate": 5.859375e-07, "loss": 0.0026, "reward": 1.7706849575042725, "reward_std": 0.08537603169679642, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.8097475171089172, "step": 3392 }, { "clip_ratio": 0.0, "completion_length": 238.03125, "epoch": 1.6572265625, "grad_norm": 1.3434826786542489, "kl": 0.08056640625, "learning_rate": 5.858154296874999e-07, "loss": 0.0032, "reward": 1.769907832145691, "reward_std": 0.02592490427196026, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7699077725410461, "step": 3393 }, { "clip_ratio": 0.0, "completion_length": 375.578125, "epoch": 1.65771484375, "grad_norm": 1.8016472051718078, "kl": 0.091552734375, "learning_rate": 5.85693359375e-07, "loss": 0.0037, "reward": 1.7779169082641602, "reward_std": 0.08736255019903183, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7779169082641602, "step": 3394 }, { "clip_ratio": 0.0, "completion_length": 255.875, "epoch": 1.658203125, "grad_norm": 2.5785595078259833, "kl": 0.080322265625, "learning_rate": 5.855712890625e-07, "loss": 0.0032, "reward": 1.7090917825698853, "reward_std": 0.027815474197268486, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7090917825698853, "step": 3395 }, { "clip_ratio": 0.0, "completion_length": 248.3828125, "epoch": 1.65869140625, "grad_norm": 1.007748533466306, "kl": 0.078369140625, "learning_rate": 5.8544921875e-07, "loss": 0.0031, "reward": 1.722628891468048, "reward_std": 0.026624855119735003, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7226289510726929, "step": 3396 }, { "clip_ratio": 0.0, "completion_length": 311.3359375, "epoch": 1.6591796875, "grad_norm": 0.6947502588037848, "kl": 0.06640625, "learning_rate": 5.853271484375e-07, "loss": 0.0027, "reward": 1.8273064494132996, "reward_std": 0.02322842739522457, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8273063898086548, "step": 3397 }, { "clip_ratio": 0.0, "completion_length": 279.484375, "epoch": 1.65966796875, "grad_norm": 1.357728234815542, "kl": 0.0673828125, "learning_rate": 5.85205078125e-07, "loss": 0.0027, "reward": 1.777342975139618, "reward_std": 0.0818490230012685, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7929678857326508, "step": 3398 }, { "clip_ratio": 0.0, "completion_length": 343.1640625, "epoch": 1.66015625, "grad_norm": 1.5073163826811782, "kl": 0.088623046875, "learning_rate": 5.850830078124999e-07, "loss": 0.0035, "reward": 1.8505135774612427, "reward_std": 0.036812907084822655, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8505135774612427, "step": 3399 }, { "clip_ratio": 0.0, "completion_length": 336.6640625, "epoch": 1.66064453125, "grad_norm": 0.5187163531397792, "kl": 0.05615234375, "learning_rate": 5.849609374999999e-07, "loss": 0.0022, "reward": 1.7177002429962158, "reward_std": 0.039677318185567856, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7177002131938934, "step": 3400 }, { "clip_ratio": 0.0, "completion_length": 447.9765625, "epoch": 1.6611328125, "grad_norm": 0.7388474136721533, "kl": 0.0703125, "learning_rate": 5.848388671875e-07, "loss": 0.0028, "reward": 1.793430507183075, "reward_std": 0.12719424441456795, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.816868007183075, "step": 3401 }, { "clip_ratio": 0.0, "completion_length": 353.7578125, "epoch": 1.66162109375, "grad_norm": 0.9275778304188466, "kl": 0.0556640625, "learning_rate": 5.84716796875e-07, "loss": 0.0022, "reward": 1.8390734195709229, "reward_std": 0.07613059133291245, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8468858897686005, "step": 3402 }, { "clip_ratio": 0.0, "completion_length": 255.3359375, "epoch": 1.662109375, "grad_norm": 1.1797462013382214, "kl": 0.0712890625, "learning_rate": 5.845947265625e-07, "loss": 0.0029, "reward": 1.7845313549041748, "reward_std": 0.056141434237360954, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.78453129529953, "step": 3403 }, { "clip_ratio": 0.0, "completion_length": 255.3046875, "epoch": 1.66259765625, "grad_norm": 1.3006085053369527, "kl": 0.074951171875, "learning_rate": 5.8447265625e-07, "loss": 0.003, "reward": 1.7825297117233276, "reward_std": 0.07007915712893009, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7903422117233276, "step": 3404 }, { "clip_ratio": 0.0, "completion_length": 222.7734375, "epoch": 1.6630859375, "grad_norm": 1.0770527330394377, "kl": 0.0595703125, "learning_rate": 5.843505859375e-07, "loss": 0.0024, "reward": 1.9472978711128235, "reward_std": 0.053828125819563866, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9472978115081787, "step": 3405 }, { "clip_ratio": 0.0, "completion_length": 314.84375, "epoch": 1.66357421875, "grad_norm": 0.940289877722008, "kl": 0.0732421875, "learning_rate": 5.842285156249999e-07, "loss": 0.0029, "reward": 1.5709947347640991, "reward_std": 0.10499111982062459, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.5944323241710663, "step": 3406 }, { "clip_ratio": 0.0, "completion_length": 203.3203125, "epoch": 1.6640625, "grad_norm": 1.5937163992276968, "kl": 0.0614013671875, "learning_rate": 5.841064453125e-07, "loss": 0.0025, "reward": 1.8672499656677246, "reward_std": 0.04773255158215761, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8672499656677246, "step": 3407 }, { "clip_ratio": 0.0, "completion_length": 315.6875, "epoch": 1.66455078125, "grad_norm": 1.0772380721292933, "kl": 0.063232421875, "learning_rate": 5.83984375e-07, "loss": 0.0025, "reward": 1.7862460613250732, "reward_std": 0.03994133323431015, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7862460613250732, "step": 3408 }, { "clip_ratio": 0.0, "completion_length": 306.3515625, "epoch": 1.6650390625, "grad_norm": 2.3302586200745905, "kl": 0.072021484375, "learning_rate": 5.838623046875e-07, "loss": 0.0029, "reward": 1.8306609988212585, "reward_std": 0.06606091558933258, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8306609988212585, "step": 3409 }, { "clip_ratio": 0.0, "completion_length": 225.609375, "epoch": 1.66552734375, "grad_norm": 1.5068422166925504, "kl": 0.087158203125, "learning_rate": 5.83740234375e-07, "loss": 0.0035, "reward": 1.7843865156173706, "reward_std": 0.03198308777064085, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.784386545419693, "step": 3410 }, { "clip_ratio": 0.0, "completion_length": 237.96875, "epoch": 1.666015625, "grad_norm": 3.0485685011397536, "kl": 0.06689453125, "learning_rate": 5.836181640625e-07, "loss": 0.0027, "reward": 1.8677841424942017, "reward_std": 0.04165232554078102, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8677841424942017, "step": 3411 }, { "clip_ratio": 0.0, "completion_length": 303.03125, "epoch": 1.66650390625, "grad_norm": 1.5582087332286472, "kl": 0.0908203125, "learning_rate": 5.834960937499999e-07, "loss": 0.0036, "reward": 1.750693678855896, "reward_std": 0.14407047256827354, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7741312682628632, "step": 3412 }, { "clip_ratio": 0.0, "completion_length": 251.6875, "epoch": 1.6669921875, "grad_norm": 1.3780651239867965, "kl": 0.0771484375, "learning_rate": 5.833740234374999e-07, "loss": 0.0031, "reward": 1.774366855621338, "reward_std": 0.03671616315841675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7743669152259827, "step": 3413 }, { "clip_ratio": 0.0, "completion_length": 289.7734375, "epoch": 1.66748046875, "grad_norm": 2.4401285404057287, "kl": 0.09814453125, "learning_rate": 5.83251953125e-07, "loss": 0.0039, "reward": 1.7291316986083984, "reward_std": 0.06025635078549385, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7291316986083984, "step": 3414 }, { "clip_ratio": 0.0, "completion_length": 217.96875, "epoch": 1.66796875, "grad_norm": 0.9632974422175405, "kl": 0.069091796875, "learning_rate": 5.831298828125e-07, "loss": 0.0028, "reward": 1.7597804069519043, "reward_std": 0.04982480686157942, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7597803771495819, "step": 3415 }, { "clip_ratio": 0.0, "completion_length": 261.0, "epoch": 1.66845703125, "grad_norm": 2.137948451017365, "kl": 0.078857421875, "learning_rate": 5.830078125e-07, "loss": 0.0032, "reward": 1.7445058226585388, "reward_std": 0.05879105068743229, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7445058226585388, "step": 3416 }, { "clip_ratio": 0.0, "completion_length": 246.1953125, "epoch": 1.6689453125, "grad_norm": 0.916829410323994, "kl": 0.0849609375, "learning_rate": 5.828857421875e-07, "loss": 0.0034, "reward": 1.767207384109497, "reward_std": 0.021022816188633442, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7672074139118195, "step": 3417 }, { "clip_ratio": 0.0, "completion_length": 352.265625, "epoch": 1.66943359375, "grad_norm": 1.8069522939932925, "kl": 0.076171875, "learning_rate": 5.82763671875e-07, "loss": 0.003, "reward": 1.7938191294670105, "reward_std": 0.06862248852849007, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7938191592693329, "step": 3418 }, { "clip_ratio": 0.0, "completion_length": 228.125, "epoch": 1.669921875, "grad_norm": 1.5981682786493596, "kl": 0.06591796875, "learning_rate": 5.826416015624999e-07, "loss": 0.0026, "reward": 1.8110138773918152, "reward_std": 0.017725080251693726, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8110139071941376, "step": 3419 }, { "clip_ratio": 0.0, "completion_length": 239.6171875, "epoch": 1.67041015625, "grad_norm": 2.2101743301776504, "kl": 0.098388671875, "learning_rate": 5.8251953125e-07, "loss": 0.0039, "reward": 1.7247655987739563, "reward_std": 0.06738665699958801, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7247655987739563, "step": 3420 }, { "clip_ratio": 0.0, "completion_length": 345.0703125, "epoch": 1.6708984375, "grad_norm": 5.321656124557536, "kl": 0.068115234375, "learning_rate": 5.823974609375e-07, "loss": 0.0027, "reward": 1.7655808925628662, "reward_std": 0.06156047061085701, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7655808329582214, "step": 3421 }, { "clip_ratio": 0.0, "completion_length": 203.8984375, "epoch": 1.67138671875, "grad_norm": 2.8203241658105247, "kl": 0.082763671875, "learning_rate": 5.82275390625e-07, "loss": 0.0033, "reward": 2.0598042607307434, "reward_std": 0.057430900633335114, "rewards/format_reward": 1.0, "rewards/ocr_reward": 1.0598042011260986, "step": 3422 }, { "clip_ratio": 0.0, "completion_length": 268.484375, "epoch": 1.671875, "grad_norm": 1.8127244648228247, "kl": 0.07421875, "learning_rate": 5.821533203125e-07, "loss": 0.003, "reward": 1.7353619933128357, "reward_std": 0.11851292103528976, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7509869635105133, "step": 3423 }, { "clip_ratio": 0.0, "completion_length": 300.9375, "epoch": 1.67236328125, "grad_norm": 4.191171905834963, "kl": 0.0677490234375, "learning_rate": 5.8203125e-07, "loss": 0.0027, "reward": 1.8721721768379211, "reward_std": 0.03585383854806423, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8721721768379211, "step": 3424 }, { "clip_ratio": 0.0, "completion_length": 222.265625, "epoch": 1.6728515625, "grad_norm": 3.1746721322821356, "kl": 0.077392578125, "learning_rate": 5.819091796874999e-07, "loss": 0.0031, "reward": 1.7025277614593506, "reward_std": 0.07956914976239204, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7025277316570282, "step": 3425 }, { "clip_ratio": 0.0, "completion_length": 298.7890625, "epoch": 1.67333984375, "grad_norm": 0.934350590497789, "kl": 0.063720703125, "learning_rate": 5.817871093749999e-07, "loss": 0.0025, "reward": 1.8146753311157227, "reward_std": 0.03896358422935009, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8146752715110779, "step": 3426 }, { "clip_ratio": 0.0, "completion_length": 382.8359375, "epoch": 1.673828125, "grad_norm": 2.2976808756968197, "kl": 0.075439453125, "learning_rate": 5.816650390625e-07, "loss": 0.003, "reward": 1.7204577922821045, "reward_std": 0.10786120407283306, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7204578518867493, "step": 3427 }, { "clip_ratio": 0.0, "completion_length": 195.703125, "epoch": 1.67431640625, "grad_norm": 5.209395887612558, "kl": 0.0859375, "learning_rate": 5.8154296875e-07, "loss": 0.0034, "reward": 1.7612760663032532, "reward_std": 0.14266540855169296, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7612760663032532, "step": 3428 }, { "clip_ratio": 0.0, "completion_length": 313.4921875, "epoch": 1.6748046875, "grad_norm": 2.974078254850948, "kl": 0.0771484375, "learning_rate": 5.814208984375e-07, "loss": 0.0031, "reward": 1.7587260007858276, "reward_std": 0.045107051730155945, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.75872603058815, "step": 3429 }, { "clip_ratio": 0.0, "completion_length": 388.3671875, "epoch": 1.67529296875, "grad_norm": 2.1974943162800957, "kl": 0.062744140625, "learning_rate": 5.81298828125e-07, "loss": 0.0025, "reward": 1.7304607629776, "reward_std": 0.07229996286332607, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7304607033729553, "step": 3430 }, { "clip_ratio": 0.0, "completion_length": 289.3515625, "epoch": 1.67578125, "grad_norm": 0.8396091238176443, "kl": 0.0738525390625, "learning_rate": 5.811767578125e-07, "loss": 0.003, "reward": 1.747983455657959, "reward_std": 0.02914267312735319, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7479834854602814, "step": 3431 }, { "clip_ratio": 0.0, "completion_length": 278.1640625, "epoch": 1.67626953125, "grad_norm": 2.6575284914917816, "kl": 0.069580078125, "learning_rate": 5.810546874999999e-07, "loss": 0.0028, "reward": 1.8400204181671143, "reward_std": 0.04621163569390774, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8400204181671143, "step": 3432 }, { "clip_ratio": 0.0, "completion_length": 213.296875, "epoch": 1.6767578125, "grad_norm": 2.4310818517899753, "kl": 0.0673828125, "learning_rate": 5.809326171875e-07, "loss": 0.0027, "reward": 1.8287239074707031, "reward_std": 0.04241657070815563, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8287239372730255, "step": 3433 }, { "clip_ratio": 0.0, "completion_length": 243.03125, "epoch": 1.67724609375, "grad_norm": 1.4441990565870806, "kl": 0.0732421875, "learning_rate": 5.80810546875e-07, "loss": 0.0029, "reward": 1.8703618049621582, "reward_std": 0.09191784635186195, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8859868347644806, "step": 3434 }, { "clip_ratio": 0.0, "completion_length": 310.84375, "epoch": 1.677734375, "grad_norm": 4.354556191413624, "kl": 0.070068359375, "learning_rate": 5.806884765625e-07, "loss": 0.0028, "reward": 1.6289713382720947, "reward_std": 0.10028214752674103, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6367838084697723, "step": 3435 }, { "clip_ratio": 0.0, "completion_length": 338.3515625, "epoch": 1.67822265625, "grad_norm": 3.14226423674001, "kl": 0.063720703125, "learning_rate": 5.8056640625e-07, "loss": 0.0025, "reward": 1.8268967866897583, "reward_std": 0.1756245121359825, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8503343164920807, "step": 3436 }, { "clip_ratio": 0.0, "completion_length": 276.421875, "epoch": 1.6787109375, "grad_norm": 1.6468385086214168, "kl": 0.0556640625, "learning_rate": 5.804443359375e-07, "loss": 0.0022, "reward": 1.8361674547195435, "reward_std": 0.07743523456156254, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8439799845218658, "step": 3437 }, { "clip_ratio": 0.0, "completion_length": 298.828125, "epoch": 1.67919921875, "grad_norm": 1.236516691039877, "kl": 0.08056640625, "learning_rate": 5.803222656249999e-07, "loss": 0.0032, "reward": 1.8144738674163818, "reward_std": 0.03578588366508484, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8144738674163818, "step": 3438 }, { "clip_ratio": 0.0, "completion_length": 295.125, "epoch": 1.6796875, "grad_norm": 7.342318015197428, "kl": 0.0582275390625, "learning_rate": 5.802001953124999e-07, "loss": 0.0023, "reward": 1.7566466927528381, "reward_std": 0.057324403896927834, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7566466629505157, "step": 3439 }, { "clip_ratio": 0.0, "completion_length": 273.5, "epoch": 1.68017578125, "grad_norm": 1.5126156935058757, "kl": 0.056884765625, "learning_rate": 5.80078125e-07, "loss": 0.0023, "reward": 1.8134649991989136, "reward_std": 0.07411767356097698, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8212774991989136, "step": 3440 }, { "clip_ratio": 0.0, "completion_length": 336.8984375, "epoch": 1.6806640625, "grad_norm": 2.8029832705522795, "kl": 0.0721435546875, "learning_rate": 5.799560546875e-07, "loss": 0.0029, "reward": 1.7705180048942566, "reward_std": 0.14411171525716782, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.786143034696579, "step": 3441 }, { "clip_ratio": 0.0, "completion_length": 260.0546875, "epoch": 1.68115234375, "grad_norm": 2.075903401049284, "kl": 0.060791015625, "learning_rate": 5.79833984375e-07, "loss": 0.0024, "reward": 1.7683696746826172, "reward_std": 0.06466953456401825, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7683696448802948, "step": 3442 }, { "clip_ratio": 0.0, "completion_length": 281.984375, "epoch": 1.681640625, "grad_norm": 1.6509715387223451, "kl": 0.086181640625, "learning_rate": 5.797119140625e-07, "loss": 0.0035, "reward": 1.7627912759780884, "reward_std": 0.034193447791039944, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7627912759780884, "step": 3443 }, { "clip_ratio": 0.0, "completion_length": 370.921875, "epoch": 1.68212890625, "grad_norm": 1.1378399236168264, "kl": 0.0523681640625, "learning_rate": 5.7958984375e-07, "loss": 0.0021, "reward": 1.7294188141822815, "reward_std": 0.13451597094535828, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7372312545776367, "step": 3444 }, { "clip_ratio": 0.0, "completion_length": 257.703125, "epoch": 1.6826171875, "grad_norm": 5.502362650100543, "kl": 0.0650634765625, "learning_rate": 5.794677734374999e-07, "loss": 0.0026, "reward": 1.8440684080123901, "reward_std": 0.08081773668527603, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8440684378147125, "step": 3445 }, { "clip_ratio": 0.0, "completion_length": 298.9375, "epoch": 1.68310546875, "grad_norm": 1.2259288505339647, "kl": 0.0706787109375, "learning_rate": 5.79345703125e-07, "loss": 0.0028, "reward": 1.7710025310516357, "reward_std": 0.12468947097659111, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7866275310516357, "step": 3446 }, { "clip_ratio": 0.0, "completion_length": 273.9921875, "epoch": 1.68359375, "grad_norm": 1.4159167368823977, "kl": 0.0728759765625, "learning_rate": 5.792236328125e-07, "loss": 0.0029, "reward": 1.782673954963684, "reward_std": 0.025566712021827698, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7826739549636841, "step": 3447 }, { "clip_ratio": 0.0, "completion_length": 295.953125, "epoch": 1.68408203125, "grad_norm": 1.2888654953284708, "kl": 0.057373046875, "learning_rate": 5.791015625e-07, "loss": 0.0023, "reward": 1.7656115293502808, "reward_std": 0.07896413654088974, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7812364995479584, "step": 3448 }, { "clip_ratio": 0.0, "completion_length": 269.4453125, "epoch": 1.6845703125, "grad_norm": 0.8051007000694611, "kl": 0.069091796875, "learning_rate": 5.789794921875e-07, "loss": 0.0028, "reward": 1.8307116031646729, "reward_std": 0.03494404815137386, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8307116329669952, "step": 3449 }, { "clip_ratio": 0.0, "completion_length": 340.03125, "epoch": 1.68505859375, "grad_norm": 1.6248548960001123, "kl": 0.057373046875, "learning_rate": 5.78857421875e-07, "loss": 0.0023, "reward": 1.6238124370574951, "reward_std": 0.10186551045626402, "rewards/format_reward": 0.96875, "rewards/ocr_reward": 0.6550624966621399, "step": 3450 }, { "clip_ratio": 0.0, "completion_length": 345.6875, "epoch": 1.685546875, "grad_norm": 1.8883437681183317, "kl": 0.0675048828125, "learning_rate": 5.787353515624999e-07, "loss": 0.0027, "reward": 1.7342004776000977, "reward_std": 0.11742651090025902, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7498254776000977, "step": 3451 }, { "clip_ratio": 0.0, "completion_length": 262.34375, "epoch": 1.68603515625, "grad_norm": 0.854071632997255, "kl": 0.06298828125, "learning_rate": 5.786132812499999e-07, "loss": 0.0025, "reward": 1.8311820030212402, "reward_std": 0.05601404421031475, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.831182062625885, "step": 3452 }, { "clip_ratio": 0.0, "completion_length": 314.0546875, "epoch": 1.6865234375, "grad_norm": 0.4515732246038297, "kl": 0.0604248046875, "learning_rate": 5.784912109375e-07, "loss": 0.0024, "reward": 1.7703008651733398, "reward_std": 0.07104413863271475, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7859258651733398, "step": 3453 }, { "clip_ratio": 0.0, "completion_length": 284.140625, "epoch": 1.68701171875, "grad_norm": 2.068456057453586, "kl": 0.08349609375, "learning_rate": 5.78369140625e-07, "loss": 0.0033, "reward": 1.7976149916648865, "reward_std": 0.04534151777625084, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7976149916648865, "step": 3454 }, { "clip_ratio": 0.0, "completion_length": 409.375, "epoch": 1.6875, "grad_norm": 1.7355862712462304, "kl": 0.0626220703125, "learning_rate": 5.782470703125e-07, "loss": 0.0025, "reward": 1.8032622337341309, "reward_std": 0.053254470229148865, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8032622635364532, "step": 3455 }, { "clip_ratio": 0.0, "completion_length": 317.1328125, "epoch": 1.68798828125, "grad_norm": 2.871369071419689, "kl": 0.0594482421875, "learning_rate": 5.78125e-07, "loss": 0.0024, "reward": 1.8522000908851624, "reward_std": 0.043344199657440186, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8522000908851624, "step": 3456 }, { "clip_ratio": 0.0, "completion_length": 328.984375, "epoch": 1.6884765625, "grad_norm": 1.0797110346192287, "kl": 0.066650390625, "learning_rate": 5.780029296875e-07, "loss": 0.0027, "reward": 1.718557059764862, "reward_std": 0.09980412572622299, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7341820597648621, "step": 3457 }, { "clip_ratio": 0.0, "completion_length": 329.3046875, "epoch": 1.68896484375, "grad_norm": 2.010061312368978, "kl": 0.071533203125, "learning_rate": 5.778808593749999e-07, "loss": 0.0029, "reward": 1.7197965383529663, "reward_std": 0.1209321841597557, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.7588590979576111, "step": 3458 }, { "clip_ratio": 0.0, "completion_length": 300.7265625, "epoch": 1.689453125, "grad_norm": 1.7529446065841696, "kl": 0.0675048828125, "learning_rate": 5.777587890624999e-07, "loss": 0.0027, "reward": 1.6905794739723206, "reward_std": 0.13847313076257706, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7062044143676758, "step": 3459 }, { "clip_ratio": 0.0, "completion_length": 314.6796875, "epoch": 1.68994140625, "grad_norm": 2.0487912470077925, "kl": 0.0673828125, "learning_rate": 5.7763671875e-07, "loss": 0.0027, "reward": 1.770250141620636, "reward_std": 0.027558826841413975, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.770250141620636, "step": 3460 }, { "clip_ratio": 0.0, "completion_length": 298.375, "epoch": 1.6904296875, "grad_norm": 1.607076612787755, "kl": 0.0579833984375, "learning_rate": 5.775146484375e-07, "loss": 0.0023, "reward": 1.7800695300102234, "reward_std": 0.07028440106660128, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7878820598125458, "step": 3461 }, { "clip_ratio": 0.0, "completion_length": 257.96875, "epoch": 1.69091796875, "grad_norm": 3.802817161929801, "kl": 0.064453125, "learning_rate": 5.77392578125e-07, "loss": 0.0026, "reward": 1.7009736895561218, "reward_std": 0.033372608944773674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7009736895561218, "step": 3462 }, { "clip_ratio": 0.0, "completion_length": 359.640625, "epoch": 1.69140625, "grad_norm": 0.8747808849685156, "kl": 0.0589599609375, "learning_rate": 5.772705078125e-07, "loss": 0.0024, "reward": 1.7267315983772278, "reward_std": 0.12720267474651337, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7423565685749054, "step": 3463 }, { "clip_ratio": 0.0, "completion_length": 294.4296875, "epoch": 1.69189453125, "grad_norm": 2.0193595822393293, "kl": 0.0712890625, "learning_rate": 5.771484374999999e-07, "loss": 0.0029, "reward": 1.708345651626587, "reward_std": 0.03282461129128933, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7083457112312317, "step": 3464 }, { "clip_ratio": 0.0, "completion_length": 306.96875, "epoch": 1.6923828125, "grad_norm": 1.7772356569257048, "kl": 0.0599365234375, "learning_rate": 5.770263671874999e-07, "loss": 0.0024, "reward": 1.7624672055244446, "reward_std": 0.11228394508361816, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7624672055244446, "step": 3465 }, { "clip_ratio": 0.0, "completion_length": 247.515625, "epoch": 1.69287109375, "grad_norm": 5.082897937319996, "kl": 0.084716796875, "learning_rate": 5.76904296875e-07, "loss": 0.0034, "reward": 1.7134467959403992, "reward_std": 0.12474965304136276, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7134467661380768, "step": 3466 }, { "clip_ratio": 0.0, "completion_length": 331.734375, "epoch": 1.693359375, "grad_norm": 7.706459266928283, "kl": 0.07421875, "learning_rate": 5.767822265625e-07, "loss": 0.003, "reward": 1.8310195803642273, "reward_std": 0.04999265819787979, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8310195505619049, "step": 3467 }, { "clip_ratio": 0.0, "completion_length": 293.515625, "epoch": 1.69384765625, "grad_norm": 1.7616788714824219, "kl": 0.09423828125, "learning_rate": 5.7666015625e-07, "loss": 0.0038, "reward": 1.7039409279823303, "reward_std": 0.12659362703561783, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7117533683776855, "step": 3468 }, { "clip_ratio": 0.0, "completion_length": 315.171875, "epoch": 1.6943359375, "grad_norm": 1.3000730711438913, "kl": 0.07666015625, "learning_rate": 5.765380859375e-07, "loss": 0.0031, "reward": 1.7594356536865234, "reward_std": 0.02355903387069702, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.759435623884201, "step": 3469 }, { "clip_ratio": 0.0, "completion_length": 242.6796875, "epoch": 1.69482421875, "grad_norm": 1.5447040369180085, "kl": 0.0908203125, "learning_rate": 5.76416015625e-07, "loss": 0.0036, "reward": 1.7440487742424011, "reward_std": 0.09019343182444572, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7440488040447235, "step": 3470 }, { "clip_ratio": 0.0, "completion_length": 307.65625, "epoch": 1.6953125, "grad_norm": 1.207797652928656, "kl": 0.0830078125, "learning_rate": 5.762939453124999e-07, "loss": 0.0033, "reward": 1.6612102389335632, "reward_std": 0.023191725835204124, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6612102389335632, "step": 3471 }, { "clip_ratio": 0.0, "completion_length": 287.8359375, "epoch": 1.69580078125, "grad_norm": 1.7416107299485089, "kl": 0.060791015625, "learning_rate": 5.761718749999999e-07, "loss": 0.0024, "reward": 1.8417921662330627, "reward_std": 0.034474316984415054, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.841792106628418, "step": 3472 }, { "clip_ratio": 0.0, "completion_length": 289.234375, "epoch": 1.6962890625, "grad_norm": 1.4354809097592818, "kl": 0.0771484375, "learning_rate": 5.760498046875e-07, "loss": 0.0031, "reward": 1.5817983150482178, "reward_std": 0.12967222556471825, "rewards/format_reward": 0.9609375, "rewards/ocr_reward": 0.6208608150482178, "step": 3473 }, { "clip_ratio": 0.0, "completion_length": 398.0, "epoch": 1.69677734375, "grad_norm": 3.4907537874766117, "kl": 0.0516357421875, "learning_rate": 5.75927734375e-07, "loss": 0.0021, "reward": 1.85581374168396, "reward_std": 0.04456772096455097, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8558137714862823, "step": 3474 }, { "clip_ratio": 0.0, "completion_length": 300.53125, "epoch": 1.697265625, "grad_norm": 0.9069326314470321, "kl": 0.064697265625, "learning_rate": 5.758056640625e-07, "loss": 0.0026, "reward": 1.777282476425171, "reward_std": 0.07658272795379162, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7772825360298157, "step": 3475 }, { "clip_ratio": 0.0, "completion_length": 280.0625, "epoch": 1.69775390625, "grad_norm": 4.59511333562533, "kl": 0.0643310546875, "learning_rate": 5.7568359375e-07, "loss": 0.0026, "reward": 1.7993093729019165, "reward_std": 0.06882397923618555, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7993092834949493, "step": 3476 }, { "clip_ratio": 0.0, "completion_length": 283.0078125, "epoch": 1.6982421875, "grad_norm": 1.3731053289621549, "kl": 0.071533203125, "learning_rate": 5.755615234375e-07, "loss": 0.0029, "reward": 1.8083871006965637, "reward_std": 0.0703160697594285, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8083871304988861, "step": 3477 }, { "clip_ratio": 0.0, "completion_length": 305.2734375, "epoch": 1.69873046875, "grad_norm": 1.629812946645828, "kl": 0.0693359375, "learning_rate": 5.754394531249999e-07, "loss": 0.0028, "reward": 1.7725472450256348, "reward_std": 0.017249885015189648, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7725472748279572, "step": 3478 }, { "clip_ratio": 0.0, "completion_length": 280.375, "epoch": 1.69921875, "grad_norm": 2.215999100156812, "kl": 0.0732421875, "learning_rate": 5.753173828125e-07, "loss": 0.0029, "reward": 1.7830110788345337, "reward_std": 0.05324237793684006, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7830111086368561, "step": 3479 }, { "clip_ratio": 0.0, "completion_length": 290.8046875, "epoch": 1.69970703125, "grad_norm": 1.6611927604495773, "kl": 0.0601806640625, "learning_rate": 5.751953125e-07, "loss": 0.0024, "reward": 1.7691839337348938, "reward_std": 0.042867109179496765, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7691839039325714, "step": 3480 }, { "clip_ratio": 0.0, "completion_length": 292.765625, "epoch": 1.7001953125, "grad_norm": 1.528033156409457, "kl": 0.068115234375, "learning_rate": 5.750732421875e-07, "loss": 0.0027, "reward": 1.7416942119598389, "reward_std": 0.07591928541660309, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7416941821575165, "step": 3481 }, { "clip_ratio": 0.0, "completion_length": 330.1640625, "epoch": 1.70068359375, "grad_norm": 1.1833895285281955, "kl": 0.06005859375, "learning_rate": 5.74951171875e-07, "loss": 0.0024, "reward": 1.7779241800308228, "reward_std": 0.1391547992825508, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.8013616800308228, "step": 3482 }, { "clip_ratio": 0.0, "completion_length": 314.90625, "epoch": 1.701171875, "grad_norm": 1.761189142899964, "kl": 0.0662841796875, "learning_rate": 5.748291015625e-07, "loss": 0.0027, "reward": 1.8467693328857422, "reward_std": 0.06352511048316956, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8467693328857422, "step": 3483 }, { "clip_ratio": 0.0, "completion_length": 296.2890625, "epoch": 1.70166015625, "grad_norm": 1.5902592871580774, "kl": 0.08154296875, "learning_rate": 5.747070312499999e-07, "loss": 0.0033, "reward": 1.7309446930885315, "reward_std": 0.07680136896669865, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7465696930885315, "step": 3484 }, { "clip_ratio": 0.0, "completion_length": 363.6953125, "epoch": 1.7021484375, "grad_norm": 10.920362143170516, "kl": 0.0516357421875, "learning_rate": 5.745849609374999e-07, "loss": 0.0021, "reward": 1.6752318739891052, "reward_std": 0.09153604693710804, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6752318441867828, "step": 3485 }, { "clip_ratio": 0.0, "completion_length": 254.6640625, "epoch": 1.70263671875, "grad_norm": 1.5465562693732942, "kl": 0.0650634765625, "learning_rate": 5.74462890625e-07, "loss": 0.0026, "reward": 1.8406208753585815, "reward_std": 0.03312433697283268, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8406208753585815, "step": 3486 }, { "clip_ratio": 0.0, "completion_length": 345.4375, "epoch": 1.703125, "grad_norm": 1.025623413376822, "kl": 0.072998046875, "learning_rate": 5.743408203125e-07, "loss": 0.0029, "reward": 1.8625024557113647, "reward_std": 0.07112840935587883, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8703149855136871, "step": 3487 }, { "clip_ratio": 0.0, "completion_length": 371.1640625, "epoch": 1.70361328125, "grad_norm": 1.0215455164365324, "kl": 0.067138671875, "learning_rate": 5.7421875e-07, "loss": 0.0027, "reward": 1.6743749380111694, "reward_std": 0.1329372152686119, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.6899998188018799, "step": 3488 }, { "clip_ratio": 0.0, "completion_length": 290.6484375, "epoch": 1.7041015625, "grad_norm": 1.5182995549340972, "kl": 0.078125, "learning_rate": 5.740966796875e-07, "loss": 0.0031, "reward": 1.6961557269096375, "reward_std": 0.05626895558089018, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6961557865142822, "step": 3489 }, { "clip_ratio": 0.0, "completion_length": 281.34375, "epoch": 1.70458984375, "grad_norm": 1.501814717810848, "kl": 0.068603515625, "learning_rate": 5.73974609375e-07, "loss": 0.0027, "reward": 1.8220676183700562, "reward_std": 0.0694145429879427, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8220676481723785, "step": 3490 }, { "clip_ratio": 0.0, "completion_length": 230.234375, "epoch": 1.705078125, "grad_norm": 8.211844024368538, "kl": 0.090576171875, "learning_rate": 5.738525390624999e-07, "loss": 0.0036, "reward": 1.8239883780479431, "reward_std": 0.062240034341812134, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8239883780479431, "step": 3491 }, { "clip_ratio": 0.0, "completion_length": 295.4765625, "epoch": 1.70556640625, "grad_norm": 0.8810007015149223, "kl": 0.05517578125, "learning_rate": 5.7373046875e-07, "loss": 0.0022, "reward": 1.8131248354911804, "reward_std": 0.04876277968287468, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8131248354911804, "step": 3492 }, { "clip_ratio": 0.0, "completion_length": 257.046875, "epoch": 1.7060546875, "grad_norm": 1.3082128636150807, "kl": 0.087890625, "learning_rate": 5.736083984375e-07, "loss": 0.0035, "reward": 1.8320286870002747, "reward_std": 0.16402263939380646, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8476536571979523, "step": 3493 }, { "clip_ratio": 0.0, "completion_length": 304.2734375, "epoch": 1.70654296875, "grad_norm": 0.7233338633250107, "kl": 0.079833984375, "learning_rate": 5.73486328125e-07, "loss": 0.0032, "reward": 1.809335172176361, "reward_std": 0.07215743651613593, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8171476721763611, "step": 3494 }, { "clip_ratio": 0.0, "completion_length": 251.9453125, "epoch": 1.70703125, "grad_norm": 1.8643023308951008, "kl": 0.083984375, "learning_rate": 5.733642578125e-07, "loss": 0.0034, "reward": 1.7340399026870728, "reward_std": 0.03040897147729993, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7340399026870728, "step": 3495 }, { "clip_ratio": 0.0, "completion_length": 287.078125, "epoch": 1.70751953125, "grad_norm": 1.5851306415379258, "kl": 0.0732421875, "learning_rate": 5.732421875e-07, "loss": 0.0029, "reward": 1.7629672288894653, "reward_std": 0.053599401377141476, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7629671692848206, "step": 3496 }, { "clip_ratio": 0.0, "completion_length": 263.859375, "epoch": 1.7080078125, "grad_norm": 1.5268802541107716, "kl": 0.072021484375, "learning_rate": 5.731201171874999e-07, "loss": 0.0029, "reward": 1.8078510761260986, "reward_std": 0.049535930156707764, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8078510463237762, "step": 3497 }, { "clip_ratio": 0.0, "completion_length": 318.421875, "epoch": 1.70849609375, "grad_norm": 3.351870958415951, "kl": 0.0635986328125, "learning_rate": 5.729980468749999e-07, "loss": 0.0025, "reward": 1.794227421283722, "reward_std": 0.09365762025117874, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7942274212837219, "step": 3498 }, { "clip_ratio": 0.0, "completion_length": 353.359375, "epoch": 1.708984375, "grad_norm": 2.7206709323249125, "kl": 0.067626953125, "learning_rate": 5.728759765625e-07, "loss": 0.0027, "reward": 1.7822973728179932, "reward_std": 0.09830936044454575, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7822974026203156, "step": 3499 }, { "clip_ratio": 0.0, "completion_length": 302.0546875, "epoch": 1.70947265625, "grad_norm": 1.0980561954495642, "kl": 0.069580078125, "learning_rate": 5.7275390625e-07, "loss": 0.0028, "reward": 1.879291832447052, "reward_std": 0.05373461917042732, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8871042728424072, "step": 3500 }, { "clip_ratio": 0.0, "completion_length": 355.4921875, "epoch": 1.7099609375, "grad_norm": 1.7798040760161173, "kl": 0.07958984375, "learning_rate": 5.726318359375e-07, "loss": 0.0032, "reward": 1.7494339346885681, "reward_std": 0.07848425209522247, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7494339644908905, "step": 3501 }, { "clip_ratio": 0.0, "completion_length": 241.703125, "epoch": 1.71044921875, "grad_norm": 1.0485894953158654, "kl": 0.0657958984375, "learning_rate": 5.72509765625e-07, "loss": 0.0026, "reward": 1.7808015942573547, "reward_std": 0.01619276311248541, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7808016836643219, "step": 3502 }, { "clip_ratio": 0.0, "completion_length": 261.21875, "epoch": 1.7109375, "grad_norm": 1.2363428348153052, "kl": 0.0810546875, "learning_rate": 5.723876953125e-07, "loss": 0.0032, "reward": 1.6847857236862183, "reward_std": 0.052163584157824516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6847857236862183, "step": 3503 }, { "clip_ratio": 0.0, "completion_length": 396.5, "epoch": 1.71142578125, "grad_norm": 1.0241359134056272, "kl": 0.04931640625, "learning_rate": 5.722656249999999e-07, "loss": 0.002, "reward": 1.8333210349082947, "reward_std": 0.06691450020298362, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8411335349082947, "step": 3504 }, { "clip_ratio": 0.0, "completion_length": 316.8125, "epoch": 1.7119140625, "grad_norm": 1.0599995273767435, "kl": 0.07568359375, "learning_rate": 5.721435546875e-07, "loss": 0.003, "reward": 1.7699226140975952, "reward_std": 0.04928914085030556, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7699226140975952, "step": 3505 }, { "clip_ratio": 0.0, "completion_length": 265.0390625, "epoch": 1.71240234375, "grad_norm": 0.9453226329364856, "kl": 0.0645751953125, "learning_rate": 5.72021484375e-07, "loss": 0.0026, "reward": 1.8773809671401978, "reward_std": 0.028110843151807785, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8773809373378754, "step": 3506 }, { "clip_ratio": 0.0, "completion_length": 309.484375, "epoch": 1.712890625, "grad_norm": 1.6786568278471627, "kl": 0.0679931640625, "learning_rate": 5.718994140625e-07, "loss": 0.0027, "reward": 1.7743658423423767, "reward_std": 0.03792189992964268, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7743658423423767, "step": 3507 }, { "clip_ratio": 0.0, "completion_length": 307.359375, "epoch": 1.71337890625, "grad_norm": 1.370461486241404, "kl": 0.075927734375, "learning_rate": 5.7177734375e-07, "loss": 0.003, "reward": 1.744931399822235, "reward_std": 0.09244660288095474, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7449314296245575, "step": 3508 }, { "clip_ratio": 0.0, "completion_length": 283.1171875, "epoch": 1.7138671875, "grad_norm": 2.7346113141185033, "kl": 0.069091796875, "learning_rate": 5.716552734375e-07, "loss": 0.0028, "reward": 1.7291991710662842, "reward_std": 0.04934484884142876, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7291992008686066, "step": 3509 }, { "clip_ratio": 0.0, "completion_length": 281.6171875, "epoch": 1.71435546875, "grad_norm": 1.4661207261525067, "kl": 0.0947265625, "learning_rate": 5.715332031249999e-07, "loss": 0.0038, "reward": 1.9682253003120422, "reward_std": 0.05683219991624355, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.9760377407073975, "step": 3510 }, { "clip_ratio": 0.0, "completion_length": 232.1328125, "epoch": 1.71484375, "grad_norm": 2.5955313987666124, "kl": 0.095703125, "learning_rate": 5.714111328124999e-07, "loss": 0.0038, "reward": 1.7580629587173462, "reward_std": 0.04326807055622339, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.758063018321991, "step": 3511 }, { "clip_ratio": 0.0, "completion_length": 287.90625, "epoch": 1.71533203125, "grad_norm": 3.7538484706080677, "kl": 0.078369140625, "learning_rate": 5.712890625e-07, "loss": 0.0031, "reward": 1.8174352049827576, "reward_std": 0.07365524023771286, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8252476751804352, "step": 3512 }, { "clip_ratio": 0.0, "completion_length": 339.0234375, "epoch": 1.7158203125, "grad_norm": 2.4016719264444077, "kl": 0.0732421875, "learning_rate": 5.711669921875e-07, "loss": 0.0029, "reward": 1.7750000953674316, "reward_std": 0.08003316074609756, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7750000655651093, "step": 3513 }, { "clip_ratio": 0.0, "completion_length": 277.546875, "epoch": 1.71630859375, "grad_norm": 1.4411311752581444, "kl": 0.0673828125, "learning_rate": 5.71044921875e-07, "loss": 0.0027, "reward": 1.8372459411621094, "reward_std": 0.05695566162467003, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8372458517551422, "step": 3514 }, { "clip_ratio": 0.0, "completion_length": 304.1953125, "epoch": 1.716796875, "grad_norm": 1.6044404016784177, "kl": 0.068359375, "learning_rate": 5.709228515625e-07, "loss": 0.0027, "reward": 1.7338838577270508, "reward_std": 0.08176321163773537, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7338838577270508, "step": 3515 }, { "clip_ratio": 0.0, "completion_length": 205.3203125, "epoch": 1.71728515625, "grad_norm": 2.0755482906159606, "kl": 0.0858154296875, "learning_rate": 5.7080078125e-07, "loss": 0.0034, "reward": 1.7466081380844116, "reward_std": 0.032042115926742554, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7466080784797668, "step": 3516 }, { "clip_ratio": 0.0, "completion_length": 250.8671875, "epoch": 1.7177734375, "grad_norm": 1.7631315601889124, "kl": 0.085693359375, "learning_rate": 5.706787109374999e-07, "loss": 0.0034, "reward": 1.7691351175308228, "reward_std": 0.05385134369134903, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7691351771354675, "step": 3517 }, { "clip_ratio": 0.0, "completion_length": 229.7734375, "epoch": 1.71826171875, "grad_norm": 1.0110099202676348, "kl": 0.092041015625, "learning_rate": 5.70556640625e-07, "loss": 0.0037, "reward": 1.6459341049194336, "reward_std": 0.09330805763602257, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6537465453147888, "step": 3518 }, { "clip_ratio": 0.0, "completion_length": 208.8046875, "epoch": 1.71875, "grad_norm": 1.0560140334835186, "kl": 0.083251953125, "learning_rate": 5.704345703125e-07, "loss": 0.0033, "reward": 1.9419002532958984, "reward_std": 0.08785379119217396, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.9419002532958984, "step": 3519 }, { "clip_ratio": 0.0, "completion_length": 314.8671875, "epoch": 1.71923828125, "grad_norm": 1.1441030012568587, "kl": 0.058837890625, "learning_rate": 5.703125e-07, "loss": 0.0024, "reward": 1.8219019174575806, "reward_std": 0.0309375561773777, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8219019174575806, "step": 3520 }, { "clip_ratio": 0.0, "completion_length": 369.03125, "epoch": 1.7197265625, "grad_norm": 2.3512579540857153, "kl": 0.08154296875, "learning_rate": 5.701904296875e-07, "loss": 0.0033, "reward": 1.7773525714874268, "reward_std": 0.04743030574172735, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7773525714874268, "step": 3521 }, { "clip_ratio": 0.0, "completion_length": 329.078125, "epoch": 1.72021484375, "grad_norm": 0.82600864833063, "kl": 0.069580078125, "learning_rate": 5.70068359375e-07, "loss": 0.0028, "reward": 1.7188506722450256, "reward_std": 0.06316574104130268, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.726663202047348, "step": 3522 }, { "clip_ratio": 0.0, "completion_length": 278.0625, "epoch": 1.720703125, "grad_norm": 2.0136715514754693, "kl": 0.092529296875, "learning_rate": 5.699462890624999e-07, "loss": 0.0037, "reward": 1.729398787021637, "reward_std": 0.07341841980814934, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7372111976146698, "step": 3523 }, { "clip_ratio": 0.0, "completion_length": 273.6171875, "epoch": 1.72119140625, "grad_norm": 1.2203906346471134, "kl": 0.0908203125, "learning_rate": 5.698242187499999e-07, "loss": 0.0036, "reward": 1.7553092241287231, "reward_std": 0.05260470602661371, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7553092241287231, "step": 3524 }, { "clip_ratio": 0.0, "completion_length": 319.890625, "epoch": 1.7216796875, "grad_norm": 0.9548802375780792, "kl": 0.10009765625, "learning_rate": 5.697021484375e-07, "loss": 0.004, "reward": 1.7244818210601807, "reward_std": 0.079419358051382, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7401067912578583, "step": 3525 }, { "clip_ratio": 0.0, "completion_length": 245.7890625, "epoch": 1.72216796875, "grad_norm": 1.743456302936007, "kl": 0.083984375, "learning_rate": 5.69580078125e-07, "loss": 0.0034, "reward": 1.7955304980278015, "reward_std": 0.028726408258080482, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7955304086208344, "step": 3526 }, { "clip_ratio": 0.0, "completion_length": 251.8828125, "epoch": 1.72265625, "grad_norm": 2.8631334970459283, "kl": 0.107421875, "learning_rate": 5.694580078125e-07, "loss": 0.0043, "reward": 1.710024654865265, "reward_std": 0.07605608738958836, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7100247442722321, "step": 3527 }, { "clip_ratio": 0.0, "completion_length": 274.09375, "epoch": 1.72314453125, "grad_norm": 11.37817101239002, "kl": 0.116455078125, "learning_rate": 5.693359375e-07, "loss": 0.0047, "reward": 1.753280758857727, "reward_std": 0.07618452608585358, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7532808184623718, "step": 3528 }, { "clip_ratio": 0.0, "completion_length": 295.734375, "epoch": 1.7236328125, "grad_norm": 0.8179633518643423, "kl": 0.0869140625, "learning_rate": 5.692138671875e-07, "loss": 0.0035, "reward": 1.7533798813819885, "reward_std": 0.024872629903256893, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7533798813819885, "step": 3529 }, { "clip_ratio": 0.0, "completion_length": 264.1953125, "epoch": 1.72412109375, "grad_norm": 1.6417406207916294, "kl": 0.072021484375, "learning_rate": 5.690917968749999e-07, "loss": 0.0029, "reward": 1.7788927555084229, "reward_std": 0.025608118914533406, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7788927257061005, "step": 3530 }, { "clip_ratio": 0.0, "completion_length": 329.4140625, "epoch": 1.724609375, "grad_norm": 0.9017859503866928, "kl": 0.073486328125, "learning_rate": 5.689697265625e-07, "loss": 0.003, "reward": 1.8279852867126465, "reward_std": 0.059508029371500015, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8279853165149689, "step": 3531 }, { "clip_ratio": 0.0, "completion_length": 225.515625, "epoch": 1.72509765625, "grad_norm": 2.5888358981187016, "kl": 0.0849609375, "learning_rate": 5.6884765625e-07, "loss": 0.0034, "reward": 1.8510370254516602, "reward_std": 0.018395755905658007, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8510370254516602, "step": 3532 }, { "clip_ratio": 0.0, "completion_length": 291.40625, "epoch": 1.7255859375, "grad_norm": 1.0428518646158351, "kl": 0.093017578125, "learning_rate": 5.687255859375e-07, "loss": 0.0037, "reward": 1.8244857788085938, "reward_std": 0.0313012283295393, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8244858086109161, "step": 3533 }, { "clip_ratio": 0.0, "completion_length": 287.140625, "epoch": 1.72607421875, "grad_norm": 2.3145569600287557, "kl": 0.0787353515625, "learning_rate": 5.68603515625e-07, "loss": 0.0031, "reward": 1.901319682598114, "reward_std": 0.029859434813261032, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.901319682598114, "step": 3534 }, { "clip_ratio": 0.0, "completion_length": 410.8828125, "epoch": 1.7265625, "grad_norm": 1.9450424606133498, "kl": 0.09033203125, "learning_rate": 5.684814453125e-07, "loss": 0.0036, "reward": 1.7189557552337646, "reward_std": 0.11504796147346497, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.7345808148384094, "step": 3535 }, { "clip_ratio": 0.0, "completion_length": 295.59375, "epoch": 1.72705078125, "grad_norm": 1.9911963089475972, "kl": 0.072265625, "learning_rate": 5.683593749999999e-07, "loss": 0.0029, "reward": 1.7401865720748901, "reward_std": 0.10161124914884567, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7401866316795349, "step": 3536 }, { "clip_ratio": 0.0, "completion_length": 267.9453125, "epoch": 1.7275390625, "grad_norm": 1.1839095765959295, "kl": 0.071044921875, "learning_rate": 5.682373046874999e-07, "loss": 0.0028, "reward": 1.889032244682312, "reward_std": 0.02803337760269642, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8890321254730225, "step": 3537 }, { "clip_ratio": 0.0, "completion_length": 264.671875, "epoch": 1.72802734375, "grad_norm": 1.7911653432373396, "kl": 0.09521484375, "learning_rate": 5.68115234375e-07, "loss": 0.0038, "reward": 1.6901207566261292, "reward_std": 0.03867476247251034, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6901208162307739, "step": 3538 }, { "clip_ratio": 0.0, "completion_length": 279.234375, "epoch": 1.728515625, "grad_norm": 1.9111408087116626, "kl": 0.088623046875, "learning_rate": 5.679931640625e-07, "loss": 0.0036, "reward": 1.7439436316490173, "reward_std": 0.078775430098176, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7439436316490173, "step": 3539 }, { "clip_ratio": 0.0, "completion_length": 314.4140625, "epoch": 1.72900390625, "grad_norm": 1.5678708925275149, "kl": 0.06494140625, "learning_rate": 5.6787109375e-07, "loss": 0.0026, "reward": 1.7854658365249634, "reward_std": 0.03933623246848583, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7854658663272858, "step": 3540 }, { "clip_ratio": 0.0, "completion_length": 327.328125, "epoch": 1.7294921875, "grad_norm": 14.526695575847024, "kl": 0.07275390625, "learning_rate": 5.677490234375e-07, "loss": 0.0029, "reward": 1.7927899956703186, "reward_std": 0.08604315388947725, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8006025552749634, "step": 3541 }, { "clip_ratio": 0.0, "completion_length": 323.2890625, "epoch": 1.72998046875, "grad_norm": 0.9340497276184674, "kl": 0.05615234375, "learning_rate": 5.67626953125e-07, "loss": 0.0022, "reward": 1.8815443515777588, "reward_std": 0.03566223941743374, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.881544291973114, "step": 3542 }, { "clip_ratio": 0.0, "completion_length": 278.09375, "epoch": 1.73046875, "grad_norm": 1.4170493602251024, "kl": 0.06884765625, "learning_rate": 5.675048828124999e-07, "loss": 0.0028, "reward": 1.8988550901412964, "reward_std": 0.02946687676012516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.898855060338974, "step": 3543 }, { "clip_ratio": 0.0, "completion_length": 400.2109375, "epoch": 1.73095703125, "grad_norm": 1.6780326561924064, "kl": 0.07470703125, "learning_rate": 5.673828125e-07, "loss": 0.003, "reward": 1.7607104778289795, "reward_std": 0.05685322359204292, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7607105076313019, "step": 3544 }, { "clip_ratio": 0.0, "completion_length": 312.078125, "epoch": 1.7314453125, "grad_norm": 1.265920665539601, "kl": 0.07958984375, "learning_rate": 5.672607421875e-07, "loss": 0.0032, "reward": 1.660174310207367, "reward_std": 0.09357069805264473, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6679868102073669, "step": 3545 }, { "clip_ratio": 0.0, "completion_length": 255.890625, "epoch": 1.73193359375, "grad_norm": 1.3387333409709177, "kl": 0.084228515625, "learning_rate": 5.67138671875e-07, "loss": 0.0034, "reward": 1.723749816417694, "reward_std": 0.08172390796244144, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7237497866153717, "step": 3546 }, { "clip_ratio": 0.0, "completion_length": 240.3984375, "epoch": 1.732421875, "grad_norm": 6.581068369525777, "kl": 0.070556640625, "learning_rate": 5.670166015625e-07, "loss": 0.0028, "reward": 1.8678494691848755, "reward_std": 0.027833457104861736, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8678494393825531, "step": 3547 }, { "clip_ratio": 0.0, "completion_length": 335.8203125, "epoch": 1.73291015625, "grad_norm": 0.7849499263584183, "kl": 0.0703125, "learning_rate": 5.6689453125e-07, "loss": 0.0028, "reward": 1.8659499883651733, "reward_std": 0.0487942174077034, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8659500479698181, "step": 3548 }, { "clip_ratio": 0.0, "completion_length": 217.6640625, "epoch": 1.7333984375, "grad_norm": 0.8256068854796749, "kl": 0.054443359375, "learning_rate": 5.667724609374999e-07, "loss": 0.0022, "reward": 1.8360854387283325, "reward_std": 0.028709974139928818, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8360854387283325, "step": 3549 }, { "clip_ratio": 0.0, "completion_length": 323.9609375, "epoch": 1.73388671875, "grad_norm": 2.5622246174809873, "kl": 0.104736328125, "learning_rate": 5.666503906249999e-07, "loss": 0.0042, "reward": 1.6949216723442078, "reward_std": 0.07603111118078232, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6949216425418854, "step": 3550 }, { "clip_ratio": 0.0, "completion_length": 403.46875, "epoch": 1.734375, "grad_norm": 1.885269304797715, "kl": 0.0487060546875, "learning_rate": 5.665283203125e-07, "loss": 0.002, "reward": 1.800632357597351, "reward_std": 0.09131154417991638, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8006323575973511, "step": 3551 }, { "clip_ratio": 0.0, "completion_length": 263.890625, "epoch": 1.73486328125, "grad_norm": 1.2590360439221413, "kl": 0.071533203125, "learning_rate": 5.6640625e-07, "loss": 0.0029, "reward": 1.747345209121704, "reward_std": 0.024599829223006964, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7473451793193817, "step": 3552 }, { "clip_ratio": 0.0, "completion_length": 233.5859375, "epoch": 1.7353515625, "grad_norm": 1.7621451023351664, "kl": 0.0531005859375, "learning_rate": 5.662841796875e-07, "loss": 0.0021, "reward": 1.8149150013923645, "reward_std": 0.06236854917369783, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8149150013923645, "step": 3553 }, { "clip_ratio": 0.0, "completion_length": 297.03125, "epoch": 1.73583984375, "grad_norm": 1.9104865022494395, "kl": 0.06640625, "learning_rate": 5.66162109375e-07, "loss": 0.0027, "reward": 1.6441280841827393, "reward_std": 0.02519212942570448, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6441280245780945, "step": 3554 }, { "clip_ratio": 0.0, "completion_length": 329.5, "epoch": 1.736328125, "grad_norm": 1.2578790013200032, "kl": 0.063720703125, "learning_rate": 5.660400390625e-07, "loss": 0.0025, "reward": 1.6923622488975525, "reward_std": 0.08163776621222496, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7001748085021973, "step": 3555 }, { "clip_ratio": 0.0, "completion_length": 255.7734375, "epoch": 1.73681640625, "grad_norm": 2.277411615256479, "kl": 0.08447265625, "learning_rate": 5.659179687499999e-07, "loss": 0.0034, "reward": 1.6454498767852783, "reward_std": 0.05983481742441654, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6454498171806335, "step": 3556 }, { "clip_ratio": 0.0, "completion_length": 380.59375, "epoch": 1.7373046875, "grad_norm": 12.33623189322276, "kl": 0.0657958984375, "learning_rate": 5.657958984374999e-07, "loss": 0.0026, "reward": 1.7834393978118896, "reward_std": 0.042173080146312714, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7834393680095673, "step": 3557 }, { "clip_ratio": 0.0, "completion_length": 313.6015625, "epoch": 1.73779296875, "grad_norm": 1.2785230001853094, "kl": 0.066162109375, "learning_rate": 5.65673828125e-07, "loss": 0.0026, "reward": 1.8725386261940002, "reward_std": 0.038782306015491486, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8725386261940002, "step": 3558 }, { "clip_ratio": 0.0, "completion_length": 293.859375, "epoch": 1.73828125, "grad_norm": 1.4368144722786214, "kl": 0.0693359375, "learning_rate": 5.655517578125e-07, "loss": 0.0028, "reward": 1.828158974647522, "reward_std": 0.07489650882780552, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.828158974647522, "step": 3559 }, { "clip_ratio": 0.0, "completion_length": 403.4375, "epoch": 1.73876953125, "grad_norm": 7.317107973230845, "kl": 0.055908203125, "learning_rate": 5.654296875e-07, "loss": 0.0022, "reward": 1.8085799813270569, "reward_std": 0.03719876706600189, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8085800111293793, "step": 3560 }, { "clip_ratio": 0.0, "completion_length": 250.8359375, "epoch": 1.7392578125, "grad_norm": 1.1775016795529225, "kl": 0.1083984375, "learning_rate": 5.653076171875e-07, "loss": 0.0043, "reward": 1.7262591123580933, "reward_std": 0.037734927609562874, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.726259171962738, "step": 3561 }, { "clip_ratio": 0.0, "completion_length": 248.421875, "epoch": 1.73974609375, "grad_norm": 1.7178106445382675, "kl": 0.082763671875, "learning_rate": 5.65185546875e-07, "loss": 0.0033, "reward": 1.6607239246368408, "reward_std": 0.06611571833491325, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6607239246368408, "step": 3562 }, { "clip_ratio": 0.0, "completion_length": 262.4375, "epoch": 1.740234375, "grad_norm": 3.1487378984733274, "kl": 0.091064453125, "learning_rate": 5.650634765624999e-07, "loss": 0.0036, "reward": 1.8339160084724426, "reward_std": 0.03272883594036102, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8339160084724426, "step": 3563 }, { "clip_ratio": 0.0, "completion_length": 208.1328125, "epoch": 1.74072265625, "grad_norm": 1.7524697236338946, "kl": 0.0654296875, "learning_rate": 5.6494140625e-07, "loss": 0.0026, "reward": 1.8101386427879333, "reward_std": 0.03559943049913272, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8101385533809662, "step": 3564 }, { "clip_ratio": 0.0, "completion_length": 287.9609375, "epoch": 1.7412109375, "grad_norm": 1.0956422342478667, "kl": 0.072509765625, "learning_rate": 5.648193359375e-07, "loss": 0.0029, "reward": 1.8012146949768066, "reward_std": 0.020907348953187466, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8012146949768066, "step": 3565 }, { "clip_ratio": 0.0, "completion_length": 281.9375, "epoch": 1.74169921875, "grad_norm": 1.1328692001884817, "kl": 0.08349609375, "learning_rate": 5.64697265625e-07, "loss": 0.0033, "reward": 1.7395102381706238, "reward_std": 0.046815380454063416, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7395102381706238, "step": 3566 }, { "clip_ratio": 0.0, "completion_length": 249.6875, "epoch": 1.7421875, "grad_norm": 8.860759098488048, "kl": 0.0732421875, "learning_rate": 5.645751953125e-07, "loss": 0.0029, "reward": 1.7368816137313843, "reward_std": 0.061663146945647895, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7368816435337067, "step": 3567 }, { "clip_ratio": 0.0, "completion_length": 357.5625, "epoch": 1.74267578125, "grad_norm": 5.322487155817771, "kl": 0.058349609375, "learning_rate": 5.64453125e-07, "loss": 0.0023, "reward": 1.7705302238464355, "reward_std": 0.09227291122078896, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7783427238464355, "step": 3568 }, { "clip_ratio": 0.0, "completion_length": 274.828125, "epoch": 1.7431640625, "grad_norm": 0.9319050653167747, "kl": 0.0540771484375, "learning_rate": 5.643310546874999e-07, "loss": 0.0022, "reward": 1.7691416144371033, "reward_std": 0.027585056610405445, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7691416144371033, "step": 3569 }, { "clip_ratio": 0.0, "completion_length": 235.6015625, "epoch": 1.74365234375, "grad_norm": 1.2001691656551983, "kl": 0.06689453125, "learning_rate": 5.642089843749999e-07, "loss": 0.0027, "reward": 1.7046304941177368, "reward_std": 0.06601490080356598, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7046305537223816, "step": 3570 }, { "clip_ratio": 0.0, "completion_length": 308.859375, "epoch": 1.744140625, "grad_norm": 1.88195678513803, "kl": 0.068359375, "learning_rate": 5.640869140625e-07, "loss": 0.0027, "reward": 1.8015184998512268, "reward_std": 0.11520305648446083, "rewards/format_reward": 0.984375, "rewards/ocr_reward": 0.8171434998512268, "step": 3571 }, { "clip_ratio": 0.0, "completion_length": 287.1484375, "epoch": 1.74462890625, "grad_norm": 1.3393368677542685, "kl": 0.0760498046875, "learning_rate": 5.6396484375e-07, "loss": 0.003, "reward": 1.6661378145217896, "reward_std": 0.028218165040016174, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6661378145217896, "step": 3572 }, { "clip_ratio": 0.0, "completion_length": 250.4296875, "epoch": 1.7451171875, "grad_norm": 1.3605222672595276, "kl": 0.079833984375, "learning_rate": 5.638427734375e-07, "loss": 0.0032, "reward": 1.737298607826233, "reward_std": 0.07310641929507256, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7372985780239105, "step": 3573 }, { "clip_ratio": 0.0, "completion_length": 341.9921875, "epoch": 1.74560546875, "grad_norm": 1.103577603969494, "kl": 0.07373046875, "learning_rate": 5.63720703125e-07, "loss": 0.003, "reward": 1.732740879058838, "reward_std": 0.06374066509306431, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7327408790588379, "step": 3574 }, { "clip_ratio": 0.0, "completion_length": 240.5546875, "epoch": 1.74609375, "grad_norm": 1.31741473565297, "kl": 0.074462890625, "learning_rate": 5.635986328125e-07, "loss": 0.003, "reward": 1.8524270057678223, "reward_std": 0.041487690061330795, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8524269759654999, "step": 3575 }, { "clip_ratio": 0.0, "completion_length": 298.203125, "epoch": 1.74658203125, "grad_norm": 1.5223059068844398, "kl": 0.083984375, "learning_rate": 5.634765624999999e-07, "loss": 0.0034, "reward": 1.6632152795791626, "reward_std": 0.053207699209451675, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6632152497768402, "step": 3576 }, { "clip_ratio": 0.0, "completion_length": 309.7421875, "epoch": 1.7470703125, "grad_norm": 3.2437260416145666, "kl": 0.0511474609375, "learning_rate": 5.633544921875e-07, "loss": 0.002, "reward": 1.8188891410827637, "reward_std": 0.09965669736266136, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8188891410827637, "step": 3577 }, { "clip_ratio": 0.0, "completion_length": 325.2734375, "epoch": 1.74755859375, "grad_norm": 8.572121114129786, "kl": 0.04833984375, "learning_rate": 5.63232421875e-07, "loss": 0.0019, "reward": 1.895998477935791, "reward_std": 0.02911460120230913, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.895998477935791, "step": 3578 }, { "clip_ratio": 0.0, "completion_length": 293.984375, "epoch": 1.748046875, "grad_norm": 1.1357685130160955, "kl": 0.058349609375, "learning_rate": 5.631103515625e-07, "loss": 0.0023, "reward": 1.8604804277420044, "reward_std": 0.03513455484062433, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.860480397939682, "step": 3579 }, { "clip_ratio": 0.0, "completion_length": 252.7265625, "epoch": 1.74853515625, "grad_norm": 1.7682231539196884, "kl": 0.05908203125, "learning_rate": 5.6298828125e-07, "loss": 0.0024, "reward": 1.8791704773902893, "reward_std": 0.033109684474766254, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8791704773902893, "step": 3580 }, { "clip_ratio": 0.0, "completion_length": 235.234375, "epoch": 1.7490234375, "grad_norm": 17.460259222379793, "kl": 0.076171875, "learning_rate": 5.628662109375e-07, "loss": 0.003, "reward": 1.7239627838134766, "reward_std": 0.05568823218345642, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7239627540111542, "step": 3581 }, { "clip_ratio": 0.0, "completion_length": 325.59375, "epoch": 1.74951171875, "grad_norm": 10.60035752233288, "kl": 0.07958984375, "learning_rate": 5.627441406249999e-07, "loss": 0.0032, "reward": 1.8284227848052979, "reward_std": 0.09722843207418919, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.8362353146076202, "step": 3582 }, { "clip_ratio": 0.0, "completion_length": 322.375, "epoch": 1.75, "grad_norm": 1.4466112179411692, "kl": 0.0595703125, "learning_rate": 5.626220703124999e-07, "loss": 0.0024, "reward": 1.7760446071624756, "reward_std": 0.0447351299226284, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7760446071624756, "step": 3583 }, { "clip_ratio": 0.0, "completion_length": 313.5234375, "epoch": 1.75048828125, "grad_norm": 1.2053351036061268, "kl": 0.083740234375, "learning_rate": 5.625e-07, "loss": 0.0033, "reward": 1.8212696313858032, "reward_std": 0.08149140700697899, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.821269690990448, "step": 3584 }, { "clip_ratio": 0.0, "completion_length": 287.5859375, "epoch": 1.7509765625, "grad_norm": 4.851214185716174, "kl": 0.083984375, "learning_rate": 5.623779296875e-07, "loss": 0.0034, "reward": 1.7441505193710327, "reward_std": 0.1517154574394226, "rewards/format_reward": 0.9765625, "rewards/ocr_reward": 0.7675879597663879, "step": 3585 }, { "clip_ratio": 0.0, "completion_length": 268.5625, "epoch": 1.75146484375, "grad_norm": 1.179914259413158, "kl": 0.07763671875, "learning_rate": 5.62255859375e-07, "loss": 0.0031, "reward": 1.7303178310394287, "reward_std": 0.047688692808151245, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7303178906440735, "step": 3586 }, { "clip_ratio": 0.0, "completion_length": 360.140625, "epoch": 1.751953125, "grad_norm": 1.8342388609879379, "kl": 0.0589599609375, "learning_rate": 5.621337890625e-07, "loss": 0.0024, "reward": 1.7912532687187195, "reward_std": 0.0980726070702076, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7990657389163971, "step": 3587 }, { "clip_ratio": 0.0, "completion_length": 357.1875, "epoch": 1.75244140625, "grad_norm": 1.7620888034734943, "kl": 0.071533203125, "learning_rate": 5.6201171875e-07, "loss": 0.0029, "reward": 1.7349724173545837, "reward_std": 0.14423664659261703, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.742784857749939, "step": 3588 }, { "clip_ratio": 0.0, "completion_length": 297.65625, "epoch": 1.7529296875, "grad_norm": 0.9893845914550516, "kl": 0.0682373046875, "learning_rate": 5.618896484374999e-07, "loss": 0.0027, "reward": 1.697411596775055, "reward_std": 0.023635744117200375, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6974115371704102, "step": 3589 }, { "clip_ratio": 0.0, "completion_length": 289.0390625, "epoch": 1.75341796875, "grad_norm": 1.3535344923489823, "kl": 0.0555419921875, "learning_rate": 5.61767578125e-07, "loss": 0.0022, "reward": 1.799051284790039, "reward_std": 0.01989690400660038, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7990512847900391, "step": 3590 }, { "clip_ratio": 0.0, "completion_length": 326.390625, "epoch": 1.75390625, "grad_norm": 1.3581378000044912, "kl": 0.063720703125, "learning_rate": 5.616455078125e-07, "loss": 0.0025, "reward": 1.6952258944511414, "reward_std": 0.03106315340846777, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.6952258944511414, "step": 3591 }, { "clip_ratio": 0.0, "completion_length": 269.6171875, "epoch": 1.75439453125, "grad_norm": 2.5388820892946122, "kl": 0.08154296875, "learning_rate": 5.615234375e-07, "loss": 0.0033, "reward": 1.7009756565093994, "reward_std": 0.09824825078248978, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.7087881565093994, "step": 3592 }, { "clip_ratio": 0.0, "completion_length": 333.53125, "epoch": 1.7548828125, "grad_norm": 1.7898213356726147, "kl": 0.0576171875, "learning_rate": 5.614013671875e-07, "loss": 0.0023, "reward": 1.7667632102966309, "reward_std": 0.038947849068790674, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7667632400989532, "step": 3593 }, { "clip_ratio": 0.0, "completion_length": 289.9140625, "epoch": 1.75537109375, "grad_norm": 1.4385952936986826, "kl": 0.0592041015625, "learning_rate": 5.61279296875e-07, "loss": 0.0024, "reward": 1.7931809425354004, "reward_std": 0.05069575086236, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7931809425354004, "step": 3594 }, { "clip_ratio": 0.0, "completion_length": 286.03125, "epoch": 1.755859375, "grad_norm": 2.1819507976548147, "kl": 0.074951171875, "learning_rate": 5.611572265624999e-07, "loss": 0.003, "reward": 1.8132346272468567, "reward_std": 0.040440889075398445, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8132346570491791, "step": 3595 }, { "clip_ratio": 0.0, "completion_length": 246.1328125, "epoch": 1.75634765625, "grad_norm": 1.2064973111450872, "kl": 0.078857421875, "learning_rate": 5.610351562499999e-07, "loss": 0.0031, "reward": 1.7098599076271057, "reward_std": 0.0605672225356102, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7098599076271057, "step": 3596 }, { "clip_ratio": 0.0, "completion_length": 211.921875, "epoch": 1.7568359375, "grad_norm": 1.7526125144836613, "kl": 0.076171875, "learning_rate": 5.609130859375e-07, "loss": 0.003, "reward": 1.747939109802246, "reward_std": 0.07001195242628455, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7479391098022461, "step": 3597 }, { "clip_ratio": 0.0, "completion_length": 316.046875, "epoch": 1.75732421875, "grad_norm": 1.0693339957223436, "kl": 0.0582275390625, "learning_rate": 5.60791015625e-07, "loss": 0.0023, "reward": 1.6757773160934448, "reward_std": 0.08137864619493484, "rewards/format_reward": 0.9921875, "rewards/ocr_reward": 0.6835898458957672, "step": 3598 }, { "clip_ratio": 0.0, "completion_length": 243.8984375, "epoch": 1.7578125, "grad_norm": 0.9296572610348767, "kl": 0.071533203125, "learning_rate": 5.606689453125e-07, "loss": 0.0029, "reward": 1.8186118006706238, "reward_std": 0.07801494561135769, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.8186118006706238, "step": 3599 }, { "clip_ratio": 0.0, "completion_length": 229.2578125, "epoch": 1.75830078125, "grad_norm": 1.7734580005970337, "kl": 0.070068359375, "learning_rate": 5.60546875e-07, "loss": 0.0028, "reward": 1.757921278476715, "reward_std": 0.05061543360352516, "rewards/format_reward": 1.0, "rewards/ocr_reward": 0.7579212486743927, "step": 3600 } ], "logging_steps": 1.0, "max_steps": 8192, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }