{ "best_metric": 0.7826179817318917, "best_model_checkpoint": "/autodl-fs/data/06_Phi4_GRPO/output/restore_MIMIC_Think_phi4_ep1_lr6e-5/v115-20251109-084105/checkpoint-4500", "epoch": 0.9090909090909091, "eval_steps": 500, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 117.0, "completions/mean_length": 84.0, "completions/min_length": 68.0, "epoch": 0.00020202020202020202, "grad_norm": 1.4127418994903564, "kl": 0.004815803375095129, "learning_rate": 4.0404040404040405e-09, "loss": -0.1671195775270462, "memory(GiB)": 63.61, "reward": 0.16490910947322845, "reward_std": 0.07778701931238174, "rewards/MultiModalAccuracyORM_Any/mean": 0.0, "rewards/MultiModalAccuracyORM_Any/std": 0.0, "rewards/ReportKG_Jaccard/mean": 0.16490910947322845, "rewards/ReportKG_Jaccard/std": 0.07778701931238174, "step": 1, "train_speed(iter/s)": 0.034829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/mean_length": 76.90625, "completions/min_length": 50.0, "epoch": 0.00101010101010101, "grad_norm": 1.8746058940887451, "kl": 0.0038225402822718024, "learning_rate": 2.0202020202020204e-08, "loss": -0.035215720534324646, "memory(GiB)": 67.17, "reward": 0.599979817867279, "reward_std": 0.385937443934381, "rewards/MultiModalAccuracyORM_Any/mean": 0.4375, "rewards/MultiModalAccuracyORM_Any/std": 0.3335031494498253, "rewards/ReportKG_Jaccard/mean": 0.16247981786727905, "rewards/ReportKG_Jaccard/std": 0.05988267809152603, "step": 5, "train_speed(iter/s)": 0.039209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.2, "completions/mean_length": 75.75, "completions/min_length": 53.6, "epoch": 0.00202020202020202, "grad_norm": 1.112656593322754, "kl": 0.004958589188754558, "learning_rate": 4.040404040404041e-08, "loss": -0.027715593576431274, "memory(GiB)": 67.5, "reward": 0.7534679055213929, "reward_std": 0.5073182046413421, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.17846790254116057, "rewards/ReportKG_Jaccard/std": 0.0594916470348835, "step": 10, "train_speed(iter/s)": 0.041176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 69.2, "completions/min_length": 50.0, "epoch": 0.0030303030303030303, "grad_norm": 2.2435855865478516, "kl": 0.004562332900241018, "learning_rate": 6.060606060606061e-08, "loss": -0.01468561440706253, "memory(GiB)": 67.5, "reward": 0.7105139255523681, "reward_std": 0.5138860166072845, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.1855139285326004, "rewards/ReportKG_Jaccard/std": 0.07875369824469089, "step": 15, "train_speed(iter/s)": 0.040621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 68.25, "completions/min_length": 50.2, "epoch": 0.00404040404040404, "grad_norm": 1.73319411277771, "kl": 0.0043660292867571115, "learning_rate": 8.080808080808082e-08, "loss": -0.06095001697540283, "memory(GiB)": 67.5, "reward": 0.563041341304779, "reward_std": 0.4396173059940338, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2130413293838501, "rewards/ReportKG_Jaccard/std": 0.09388308972120285, "step": 20, "train_speed(iter/s)": 0.042025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.6, "completions/mean_length": 83.775, "completions/min_length": 70.8, "epoch": 0.005050505050505051, "grad_norm": 1.1845703125, "kl": 0.0038998892065137624, "learning_rate": 1.01010101010101e-07, "loss": 0.024488487839698793, "memory(GiB)": 67.5, "reward": 0.6538947641849517, "reward_std": 0.39821062982082367, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.27889477014541625, "rewards/ReportKG_Jaccard/std": 0.059872113168239594, "step": 25, "train_speed(iter/s)": 0.043153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 72.05, "completions/min_length": 46.8, "epoch": 0.006060606060606061, "grad_norm": 1.4393625259399414, "kl": 0.004924404015764594, "learning_rate": 1.2121212121212122e-07, "loss": 0.03639696836471558, "memory(GiB)": 67.5, "reward": 0.661827564239502, "reward_std": 0.5068373441696167, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.2118275672197342, "rewards/ReportKG_Jaccard/std": 0.07933988720178604, "step": 30, "train_speed(iter/s)": 0.043156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 72.95, "completions/min_length": 50.4, "epoch": 0.007070707070707071, "grad_norm": 1.86654794216156, "kl": 0.004342849226668477, "learning_rate": 1.4141414141414141e-07, "loss": -0.0023222237825393675, "memory(GiB)": 67.5, "reward": 0.6991829037666321, "reward_std": 0.4987104177474976, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.17418287843465804, "rewards/ReportKG_Jaccard/std": 0.06309234574437142, "step": 35, "train_speed(iter/s)": 0.042351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 68.125, "completions/min_length": 53.2, "epoch": 0.00808080808080808, "grad_norm": 2.6194019317626953, "kl": 0.00416156854480505, "learning_rate": 1.6161616161616163e-07, "loss": 0.008672840893268585, "memory(GiB)": 67.5, "reward": 0.7364040315151215, "reward_std": 0.4371423006057739, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.2364040046930313, "rewards/ReportKG_Jaccard/std": 0.05627602264285088, "step": 40, "train_speed(iter/s)": 0.043517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 72.875, "completions/min_length": 51.0, "epoch": 0.00909090909090909, "grad_norm": 1.2192723751068115, "kl": 0.00365946046076715, "learning_rate": 1.818181818181818e-07, "loss": -0.019394551217556, "memory(GiB)": 67.5, "reward": 0.6059627175331116, "reward_std": 0.41519854962825775, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3703280448913574, "rewards/ReportKG_Jaccard/mean": 0.20596272349357606, "rewards/ReportKG_Jaccard/std": 0.0639511827379465, "step": 45, "train_speed(iter/s)": 0.041183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 76.2, "completions/min_length": 59.8, "epoch": 0.010101010101010102, "grad_norm": 1.476742148399353, "kl": 0.003394790319725871, "learning_rate": 2.02020202020202e-07, "loss": -0.06444811820983887, "memory(GiB)": 67.5, "reward": 0.6455698370933532, "reward_std": 0.4884597361087799, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.2705698311328888, "rewards/ReportKG_Jaccard/std": 0.04980254322290421, "step": 50, "train_speed(iter/s)": 0.042019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.2, "completions/mean_length": 75.4, "completions/min_length": 49.6, "epoch": 0.011111111111111112, "grad_norm": 1.59047269821167, "kl": 0.004284722078591585, "learning_rate": 2.222222222222222e-07, "loss": -0.06152915954589844, "memory(GiB)": 67.84, "reward": 0.5289235711097717, "reward_std": 0.5054871141910553, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.15392358526587485, "rewards/ReportKG_Jaccard/std": 0.04885574579238892, "step": 55, "train_speed(iter/s)": 0.041286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 86.975, "completions/min_length": 68.8, "epoch": 0.012121212121212121, "grad_norm": 1.1595656871795654, "kl": 0.0042577513959258795, "learning_rate": 2.4242424242424244e-07, "loss": -0.017993266880512237, "memory(GiB)": 67.84, "reward": 0.67302086353302, "reward_std": 0.4462469041347504, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.43348987102508546, "rewards/ReportKG_Jaccard/mean": 0.22302087545394897, "rewards/ReportKG_Jaccard/std": 0.04100481010973454, "step": 60, "train_speed(iter/s)": 0.041333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 77.125, "completions/min_length": 56.6, "epoch": 0.013131313131313131, "grad_norm": 1.1910372972488403, "kl": 0.004867389146238565, "learning_rate": 2.6262626262626266e-07, "loss": 0.0025691017508506775, "memory(GiB)": 67.84, "reward": 0.7719671964645386, "reward_std": 0.5353692233562469, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4956935167312622, "rewards/ReportKG_Jaccard/mean": 0.2469671845436096, "rewards/ReportKG_Jaccard/std": 0.06431892439723015, "step": 65, "train_speed(iter/s)": 0.042031 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/mean_length": 76.225, "completions/min_length": 56.8, "epoch": 0.014141414141414142, "grad_norm": 1.454534888267517, "kl": 0.004679276328533888, "learning_rate": 2.8282828282828283e-07, "loss": -0.029844245314598082, "memory(GiB)": 67.84, "reward": 0.47684150338172915, "reward_std": 0.3850406274199486, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.17684151828289033, "rewards/ReportKG_Jaccard/std": 0.062703488022089, "step": 70, "train_speed(iter/s)": 0.041352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 73.925, "completions/min_length": 53.6, "epoch": 0.015151515151515152, "grad_norm": 1.1838680505752563, "kl": 0.004495792789384723, "learning_rate": 3.0303030303030305e-07, "loss": -0.058748120069503786, "memory(GiB)": 67.84, "reward": 0.5322676569223403, "reward_std": 0.345251139998436, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.23226765245199205, "rewards/ReportKG_Jaccard/std": 0.06511425673961639, "step": 75, "train_speed(iter/s)": 0.041679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.8, "completions/mean_length": 72.625, "completions/min_length": 52.6, "epoch": 0.01616161616161616, "grad_norm": 1.4972236156463623, "kl": 0.004315123101696372, "learning_rate": 3.2323232323232327e-07, "loss": 0.0033837072551250458, "memory(GiB)": 67.84, "reward": 0.3833277016878128, "reward_std": 0.3001883625984192, "rewards/MultiModalAccuracyORM_Any/mean": 0.15, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.23332769870758058, "rewards/ReportKG_Jaccard/std": 0.05732347741723061, "step": 80, "train_speed(iter/s)": 0.042332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 73.85, "completions/min_length": 55.0, "epoch": 0.01717171717171717, "grad_norm": 1.4066276550292969, "kl": 0.004072431102395057, "learning_rate": 3.434343434343434e-07, "loss": -0.0024571143090724944, "memory(GiB)": 67.84, "reward": 0.5993545472621917, "reward_std": 0.47502520084381106, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.12435454577207565, "rewards/ReportKG_Jaccard/std": 0.046477243304252625, "step": 85, "train_speed(iter/s)": 0.042751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 70.35, "completions/min_length": 49.4, "epoch": 0.01818181818181818, "grad_norm": 1.1000266075134277, "kl": 0.0049742131493985655, "learning_rate": 3.636363636363636e-07, "loss": -0.01562841087579727, "memory(GiB)": 67.84, "reward": 0.5183968096971512, "reward_std": 0.40814904421567916, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.16839680671691895, "rewards/ReportKG_Jaccard/std": 0.04966699853539467, "step": 90, "train_speed(iter/s)": 0.04281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.0, "completions/mean_length": 68.825, "completions/min_length": 53.6, "epoch": 0.01919191919191919, "grad_norm": 1.7257260084152222, "kl": 0.0037511582020670177, "learning_rate": 3.8383838383838377e-07, "loss": 0.014930522441864014, "memory(GiB)": 67.84, "reward": 0.5378001511096955, "reward_std": 0.5093084633350372, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.21280015408992767, "rewards/ReportKG_Jaccard/std": 0.06836327016353608, "step": 95, "train_speed(iter/s)": 0.043056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 65.65, "completions/min_length": 51.4, "epoch": 0.020202020202020204, "grad_norm": 1.6906191110610962, "kl": 0.004469379177317023, "learning_rate": 4.04040404040404e-07, "loss": -0.03460813760757446, "memory(GiB)": 67.84, "reward": 0.6896730184555053, "reward_std": 0.5434396445751191, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.5100160002708435, "rewards/ReportKG_Jaccard/mean": 0.2146730273962021, "rewards/ReportKG_Jaccard/std": 0.05881467089056969, "step": 100, "train_speed(iter/s)": 0.043171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 74.4, "completions/mean_length": 61.75, "completions/min_length": 47.8, "epoch": 0.021212121212121213, "grad_norm": 1.6725959777832031, "kl": 0.0035231282468885182, "learning_rate": 4.242424242424242e-07, "loss": -0.04622234702110291, "memory(GiB)": 67.84, "reward": 0.532874870300293, "reward_std": 0.48736907839775084, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.4956935167312622, "rewards/ReportKG_Jaccard/mean": 0.2078748971223831, "rewards/ReportKG_Jaccard/std": 0.04059442542493343, "step": 105, "train_speed(iter/s)": 0.043647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.6, "completions/mean_length": 75.85, "completions/min_length": 56.4, "epoch": 0.022222222222222223, "grad_norm": 1.1638422012329102, "kl": 0.004266635701060295, "learning_rate": 4.444444444444444e-07, "loss": 0.03219068646430969, "memory(GiB)": 67.84, "reward": 0.4802082121372223, "reward_std": 0.482258665561676, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.45536120533943175, "rewards/ReportKG_Jaccard/mean": 0.20520820319652558, "rewards/ReportKG_Jaccard/std": 0.0679511696100235, "step": 110, "train_speed(iter/s)": 0.043737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.8, "completions/mean_length": 73.325, "completions/min_length": 61.2, "epoch": 0.023232323232323233, "grad_norm": 1.4547399282455444, "kl": 0.004058712162077427, "learning_rate": 4.646464646464646e-07, "loss": -0.04502758979797363, "memory(GiB)": 67.84, "reward": 0.7532603740692139, "reward_std": 0.5362311720848083, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.48816035985946654, "rewards/ReportKG_Jaccard/mean": 0.2282603770494461, "rewards/ReportKG_Jaccard/std": 0.07706203982234001, "step": 115, "train_speed(iter/s)": 0.044012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 67.4, "completions/min_length": 52.8, "epoch": 0.024242424242424242, "grad_norm": 1.5059096813201904, "kl": 0.003821024764329195, "learning_rate": 4.848484848484849e-07, "loss": -0.021328425407409667, "memory(GiB)": 67.84, "reward": 0.6756758093833923, "reward_std": 0.5224064469337464, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.48816035985946654, "rewards/ReportKG_Jaccard/mean": 0.200675830245018, "rewards/ReportKG_Jaccard/std": 0.0696561649441719, "step": 120, "train_speed(iter/s)": 0.044208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 76.2, "completions/min_length": 60.8, "epoch": 0.025252525252525252, "grad_norm": 1.6129601001739502, "kl": 0.003924520174041391, "learning_rate": 5.05050505050505e-07, "loss": 0.01018531620502472, "memory(GiB)": 67.84, "reward": 0.5182617247104645, "reward_std": 0.45195512771606444, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.14326172620058059, "rewards/ReportKG_Jaccard/std": 0.04472508244216442, "step": 125, "train_speed(iter/s)": 0.043994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 65.825, "completions/min_length": 48.8, "epoch": 0.026262626262626262, "grad_norm": 1.4397454261779785, "kl": 0.004298774339258671, "learning_rate": 5.252525252525253e-07, "loss": 0.010259199142456054, "memory(GiB)": 67.84, "reward": 0.5421425759792328, "reward_std": 0.4445886194705963, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.21714258044958115, "rewards/ReportKG_Jaccard/std": 0.050442248955368994, "step": 130, "train_speed(iter/s)": 0.04429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.8, "completions/mean_length": 67.1, "completions/min_length": 52.0, "epoch": 0.02727272727272727, "grad_norm": 1.785538673400879, "kl": 0.004834099486470222, "learning_rate": 5.454545454545454e-07, "loss": -0.004743068292737007, "memory(GiB)": 67.84, "reward": 0.7585354804992676, "reward_std": 0.4752891719341278, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.2335354879498482, "rewards/ReportKG_Jaccard/std": 0.07157066017389298, "step": 135, "train_speed(iter/s)": 0.04459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 70.575, "completions/min_length": 47.4, "epoch": 0.028282828282828285, "grad_norm": 1.3536955118179321, "kl": 0.004887647787109017, "learning_rate": 5.656565656565657e-07, "loss": 0.0803380012512207, "memory(GiB)": 67.84, "reward": 0.5009036660194397, "reward_std": 0.3897562935948372, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2259036660194397, "rewards/ReportKG_Jaccard/std": 0.06713246181607246, "step": 140, "train_speed(iter/s)": 0.044516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 69.125, "completions/min_length": 54.0, "epoch": 0.029292929292929294, "grad_norm": 1.662580966949463, "kl": 0.004793228255584836, "learning_rate": 5.858585858585858e-07, "loss": -0.009612908959388733, "memory(GiB)": 67.84, "reward": 0.5637141168117523, "reward_std": 0.5127399921417236, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.4881446659564972, "rewards/ReportKG_Jaccard/mean": 0.16371410936117173, "rewards/ReportKG_Jaccard/std": 0.04976056441664696, "step": 145, "train_speed(iter/s)": 0.044788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.6, "completions/mean_length": 68.2, "completions/min_length": 52.6, "epoch": 0.030303030303030304, "grad_norm": 1.6989446878433228, "kl": 0.005064842524006963, "learning_rate": 6.060606060606061e-07, "loss": -0.07161028981208802, "memory(GiB)": 67.84, "reward": 0.7297674775123596, "reward_std": 0.3287062034010887, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2777460336685181, "rewards/ReportKG_Jaccard/mean": 0.1797674648463726, "rewards/ReportKG_Jaccard/std": 0.06894566752016544, "step": 150, "train_speed(iter/s)": 0.045086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.2, "completions/mean_length": 79.1, "completions/min_length": 59.2, "epoch": 0.031313131313131314, "grad_norm": 1.1203267574310303, "kl": 0.004146010940894484, "learning_rate": 6.262626262626263e-07, "loss": -0.011180748790502548, "memory(GiB)": 67.84, "reward": 0.600795179605484, "reward_std": 0.4686845600605011, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.17579516172409057, "rewards/ReportKG_Jaccard/std": 0.04953835904598236, "step": 155, "train_speed(iter/s)": 0.045169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 76.025, "completions/min_length": 60.0, "epoch": 0.03232323232323232, "grad_norm": 1.0947939157485962, "kl": 0.003965690266340971, "learning_rate": 6.464646464646465e-07, "loss": -0.004620884731411934, "memory(GiB)": 67.84, "reward": 0.4543646454811096, "reward_std": 0.42410468459129336, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.43348987102508546, "rewards/ReportKG_Jaccard/mean": 0.20436465442180635, "rewards/ReportKG_Jaccard/std": 0.03936793804168701, "step": 160, "train_speed(iter/s)": 0.045304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 79.675, "completions/min_length": 54.2, "epoch": 0.03333333333333333, "grad_norm": 1.2303993701934814, "kl": 0.005588589143007994, "learning_rate": 6.666666666666666e-07, "loss": -0.04399003386497498, "memory(GiB)": 67.84, "reward": 0.5342954367399215, "reward_std": 0.47608867287635803, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.46628902554512025, "rewards/ReportKG_Jaccard/mean": 0.13429542854428292, "rewards/ReportKG_Jaccard/std": 0.0481293823570013, "step": 165, "train_speed(iter/s)": 0.045362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.6, "completions/mean_length": 73.325, "completions/min_length": 51.8, "epoch": 0.03434343434343434, "grad_norm": 1.2199978828430176, "kl": 0.004752499982714653, "learning_rate": 6.868686868686868e-07, "loss": -0.06372400522232055, "memory(GiB)": 67.84, "reward": 0.7415132939815521, "reward_std": 0.416855251789093, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3703280448913574, "rewards/ReportKG_Jaccard/mean": 0.14151330590248107, "rewards/ReportKG_Jaccard/std": 0.06306667476892472, "step": 170, "train_speed(iter/s)": 0.045188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.6, "completions/mean_length": 84.375, "completions/min_length": 64.2, "epoch": 0.03535353535353535, "grad_norm": 1.3579285144805908, "kl": 0.004577062278985977, "learning_rate": 7.07070707070707e-07, "loss": -0.01734355241060257, "memory(GiB)": 67.84, "reward": 0.43975071609020233, "reward_std": 0.3448092743754387, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.16475071758031845, "rewards/ReportKG_Jaccard/std": 0.050760917365550995, "step": 175, "train_speed(iter/s)": 0.045236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 74.4, "completions/min_length": 50.0, "epoch": 0.03636363636363636, "grad_norm": 1.4275375604629517, "kl": 0.004746781755238771, "learning_rate": 7.272727272727272e-07, "loss": -0.06294171214103698, "memory(GiB)": 67.84, "reward": 0.5843373596668243, "reward_std": 0.4517862617969513, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3921836853027344, "rewards/ReportKG_Jaccard/mean": 0.18433734774589539, "rewards/ReportKG_Jaccard/std": 0.07401845306158066, "step": 180, "train_speed(iter/s)": 0.045287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.8, "completions/mean_length": 74.45, "completions/min_length": 57.0, "epoch": 0.03737373737373737, "grad_norm": 1.0601800680160522, "kl": 0.004807179979979992, "learning_rate": 7.474747474747475e-07, "loss": -0.02363591343164444, "memory(GiB)": 67.84, "reward": 0.799380874633789, "reward_std": 0.4538723349571228, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.2743808627128601, "rewards/ReportKG_Jaccard/std": 0.06433583498001098, "step": 185, "train_speed(iter/s)": 0.045364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 86.05, "completions/min_length": 64.2, "epoch": 0.03838383838383838, "grad_norm": 1.7043582201004028, "kl": 0.005028474424034357, "learning_rate": 7.676767676767675e-07, "loss": -0.04728108048439026, "memory(GiB)": 67.84, "reward": 0.8274551868438721, "reward_std": 0.46148212552070617, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.3024551957845688, "rewards/ReportKG_Jaccard/std": 0.06993569321930408, "step": 190, "train_speed(iter/s)": 0.045509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 76.15, "completions/min_length": 56.4, "epoch": 0.03939393939393939, "grad_norm": 1.5408376455307007, "kl": 0.00599122098647058, "learning_rate": 7.878787878787878e-07, "loss": -0.023171786963939667, "memory(GiB)": 67.84, "reward": 0.7038525342941284, "reward_std": 0.5420243680477143, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.5024828433990478, "rewards/ReportKG_Jaccard/mean": 0.22885252833366393, "rewards/ReportKG_Jaccard/std": 0.0638948630541563, "step": 195, "train_speed(iter/s)": 0.045704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 79.175, "completions/min_length": 58.2, "epoch": 0.04040404040404041, "grad_norm": 1.3374706506729126, "kl": 0.005496765580028295, "learning_rate": 8.08080808080808e-07, "loss": -0.010852257907390594, "memory(GiB)": 67.84, "reward": 0.3031422942876816, "reward_std": 0.2880155712366104, "rewards/MultiModalAccuracyORM_Any/mean": 0.125, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.17814228162169457, "rewards/ReportKG_Jaccard/std": 0.058440587297081945, "step": 200, "train_speed(iter/s)": 0.045417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 75.075, "completions/min_length": 55.6, "epoch": 0.04141414141414142, "grad_norm": 1.160201907157898, "kl": 0.006668698694556952, "learning_rate": 8.282828282828283e-07, "loss": -0.0520630419254303, "memory(GiB)": 67.84, "reward": 0.5198905974626541, "reward_std": 0.3439183235168457, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.16989062279462813, "rewards/ReportKG_Jaccard/std": 0.06079721841961146, "step": 205, "train_speed(iter/s)": 0.045573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.4, "completions/mean_length": 68.0, "completions/min_length": 56.6, "epoch": 0.04242424242424243, "grad_norm": 1.4960246086120605, "kl": 0.007450410071760416, "learning_rate": 8.484848484848484e-07, "loss": 0.057378602027893064, "memory(GiB)": 67.84, "reward": 0.8650216817855835, "reward_std": 0.48818498849868774, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.290021687746048, "rewards/ReportKG_Jaccard/std": 0.15611103251576425, "step": 210, "train_speed(iter/s)": 0.045793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 68.3, "completions/min_length": 50.4, "epoch": 0.043434343434343436, "grad_norm": 1.2443673610687256, "kl": 0.004958447767421603, "learning_rate": 8.686868686868687e-07, "loss": 0.028581443428993224, "memory(GiB)": 67.84, "reward": 0.6158412218093872, "reward_std": 0.5073486030101776, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.4553455114364624, "rewards/ReportKG_Jaccard/mean": 0.21584123522043228, "rewards/ReportKG_Jaccard/std": 0.07397108487784862, "step": 215, "train_speed(iter/s)": 0.045895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 76.65, "completions/min_length": 61.6, "epoch": 0.044444444444444446, "grad_norm": 1.5659555196762085, "kl": 0.0065505016595125195, "learning_rate": 8.888888888888888e-07, "loss": 0.04913180470466614, "memory(GiB)": 67.84, "reward": 0.5773215413093566, "reward_std": 0.41430501341819764, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.3863525390625, "rewards/ReportKG_Jaccard/mean": 0.2023215502500534, "rewards/ReportKG_Jaccard/std": 0.051386597007513045, "step": 220, "train_speed(iter/s)": 0.045979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 120.0, "completions/mean_length": 86.65, "completions/min_length": 59.8, "epoch": 0.045454545454545456, "grad_norm": 1.5271772146224976, "kl": 0.007534209825098515, "learning_rate": 9.09090909090909e-07, "loss": -0.014577154815196992, "memory(GiB)": 67.84, "reward": 0.7346525311470031, "reward_std": 0.4957533717155457, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4847656965255737, "rewards/ReportKG_Jaccard/mean": 0.13465254083275796, "rewards/ReportKG_Jaccard/std": 0.04629776701331138, "step": 225, "train_speed(iter/s)": 0.045918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.2, "completions/mean_length": 65.0, "completions/min_length": 49.4, "epoch": 0.046464646464646465, "grad_norm": 1.4512174129486084, "kl": 0.00884429169818759, "learning_rate": 9.292929292929292e-07, "loss": 0.01394592523574829, "memory(GiB)": 67.84, "reward": 0.619159996509552, "reward_std": 0.5392500877380371, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.49908818006515504, "rewards/ReportKG_Jaccard/mean": 0.16915997639298438, "rewards/ReportKG_Jaccard/std": 0.056121711432933805, "step": 230, "train_speed(iter/s)": 0.046043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 69.575, "completions/min_length": 49.4, "epoch": 0.047474747474747475, "grad_norm": 1.2636798620224, "kl": 0.006319958902895451, "learning_rate": 9.494949494949495e-07, "loss": -0.03671115040779114, "memory(GiB)": 67.84, "reward": 0.535284548997879, "reward_std": 0.3608466908335686, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.30639100074768066, "rewards/ReportKG_Jaccard/mean": 0.18528454899787902, "rewards/ReportKG_Jaccard/std": 0.06635510623455047, "step": 235, "train_speed(iter/s)": 0.046081 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 79.15, "completions/min_length": 61.4, "epoch": 0.048484848484848485, "grad_norm": 0.6950473189353943, "kl": 0.007030707178637385, "learning_rate": 9.696969696969698e-07, "loss": -0.01308804452419281, "memory(GiB)": 67.84, "reward": 0.9890992522239686, "reward_std": 0.37263135313987733, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.23909923881292344, "rewards/ReportKG_Jaccard/std": 0.057095546275377274, "step": 240, "train_speed(iter/s)": 0.045978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 76.925, "completions/min_length": 54.0, "epoch": 0.049494949494949494, "grad_norm": 1.7734102010726929, "kl": 0.008498911000788211, "learning_rate": 9.898989898989898e-07, "loss": 0.013768544793128968, "memory(GiB)": 67.84, "reward": 0.7369955360889435, "reward_std": 0.4786779940128326, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.21199552863836288, "rewards/ReportKG_Jaccard/std": 0.07267875112593174, "step": 245, "train_speed(iter/s)": 0.045831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 75.925, "completions/min_length": 58.4, "epoch": 0.050505050505050504, "grad_norm": 1.0565730333328247, "kl": 0.007195144332945347, "learning_rate": 1.01010101010101e-06, "loss": -0.07560399174690247, "memory(GiB)": 67.84, "reward": 0.8613974690437317, "reward_std": 0.3568786710500717, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.21139744073152542, "rewards/ReportKG_Jaccard/std": 0.05264811515808106, "step": 250, "train_speed(iter/s)": 0.045682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.6, "completions/mean_length": 82.6, "completions/min_length": 59.8, "epoch": 0.051515151515151514, "grad_norm": 0.9875420928001404, "kl": 0.007138081220909953, "learning_rate": 1.0303030303030302e-06, "loss": -0.051291102170944215, "memory(GiB)": 67.84, "reward": 0.9279730677604675, "reward_std": 0.4111753046512604, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.17797307819128036, "rewards/ReportKG_Jaccard/std": 0.0487345140427351, "step": 255, "train_speed(iter/s)": 0.045757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.4, "completions/mean_length": 72.8, "completions/min_length": 52.0, "epoch": 0.052525252525252523, "grad_norm": 1.5926034450531006, "kl": 0.006015518493950367, "learning_rate": 1.0505050505050506e-06, "loss": -0.0005891650915145874, "memory(GiB)": 67.84, "reward": 0.3983414888381958, "reward_std": 0.3761130541563034, "rewards/MultiModalAccuracyORM_Any/mean": 0.2, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.19834147691726683, "rewards/ReportKG_Jaccard/std": 0.057343654334545135, "step": 260, "train_speed(iter/s)": 0.045791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 68.2, "completions/min_length": 50.2, "epoch": 0.05353535353535353, "grad_norm": 1.2992042303085327, "kl": 0.010935452673584223, "learning_rate": 1.0707070707070707e-06, "loss": 0.05552498698234558, "memory(GiB)": 67.84, "reward": 0.6421244382858277, "reward_std": 0.4713945686817169, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.4696836888790131, "rewards/ReportKG_Jaccard/mean": 0.1671244189143181, "rewards/ReportKG_Jaccard/std": 0.06642995476722717, "step": 265, "train_speed(iter/s)": 0.045795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 70.575, "completions/min_length": 57.6, "epoch": 0.05454545454545454, "grad_norm": 1.4619072675704956, "kl": 0.008996676746755838, "learning_rate": 1.0909090909090908e-06, "loss": 0.018508487939834596, "memory(GiB)": 67.84, "reward": 0.9034430146217346, "reward_std": 0.45509650111198424, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.22844301760196686, "rewards/ReportKG_Jaccard/std": 0.06250546276569366, "step": 270, "train_speed(iter/s)": 0.045588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 71.925, "completions/min_length": 49.6, "epoch": 0.05555555555555555, "grad_norm": 1.511723518371582, "kl": 0.007904832530766726, "learning_rate": 1.111111111111111e-06, "loss": -0.03079437017440796, "memory(GiB)": 67.84, "reward": 0.820904552936554, "reward_std": 0.5233882665634155, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.19590454995632173, "rewards/ReportKG_Jaccard/std": 0.07171339318156242, "step": 275, "train_speed(iter/s)": 0.045692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 72.275, "completions/min_length": 51.0, "epoch": 0.05656565656565657, "grad_norm": 1.5283451080322266, "kl": 0.008483657706528902, "learning_rate": 1.1313131313131313e-06, "loss": 0.053810489177703855, "memory(GiB)": 67.84, "reward": 0.9317226886749268, "reward_std": 0.36633925288915636, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.20672268867492677, "rewards/ReportKG_Jaccard/std": 0.06321907937526702, "step": 280, "train_speed(iter/s)": 0.045795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.0, "completions/mean_length": 82.075, "completions/min_length": 55.0, "epoch": 0.05757575757575758, "grad_norm": 1.2000826597213745, "kl": 0.011320959217846394, "learning_rate": 1.1515151515151516e-06, "loss": -0.026404416561126708, "memory(GiB)": 67.84, "reward": 0.7443005859851837, "reward_std": 0.48631473779678347, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4629100561141968, "rewards/ReportKG_Jaccard/mean": 0.194300577044487, "rewards/ReportKG_Jaccard/std": 0.059906260669231416, "step": 285, "train_speed(iter/s)": 0.045493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.8, "completions/mean_length": 68.325, "completions/min_length": 46.0, "epoch": 0.05858585858585859, "grad_norm": 1.6559638977050781, "kl": 0.009746827371418475, "learning_rate": 1.1717171717171716e-06, "loss": 0.010855833441019059, "memory(GiB)": 67.84, "reward": 0.7213695704936981, "reward_std": 0.4770206451416016, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.1463695779442787, "rewards/ReportKG_Jaccard/std": 0.05148925855755806, "step": 290, "train_speed(iter/s)": 0.045416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 77.575, "completions/min_length": 63.4, "epoch": 0.0595959595959596, "grad_norm": 1.6580570936203003, "kl": 0.011821372993290424, "learning_rate": 1.1919191919191917e-06, "loss": 0.0036185789853334428, "memory(GiB)": 67.84, "reward": 0.9257219910621644, "reward_std": 0.4539998210966587, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.22572200298309325, "rewards/ReportKG_Jaccard/std": 0.11523594036698341, "step": 295, "train_speed(iter/s)": 0.045261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 67.575, "completions/min_length": 48.4, "epoch": 0.06060606060606061, "grad_norm": 2.030449390411377, "kl": 0.009892181027680635, "learning_rate": 1.2121212121212122e-06, "loss": -0.009582491219043731, "memory(GiB)": 67.84, "reward": 0.6026742339134217, "reward_std": 0.49124762415885925, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.4478123545646667, "rewards/ReportKG_Jaccard/mean": 0.1526742085814476, "rewards/ReportKG_Jaccard/std": 0.06047110147774219, "step": 300, "train_speed(iter/s)": 0.04503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 74.6, "completions/min_length": 52.4, "epoch": 0.06161616161616162, "grad_norm": 1.5157526731491089, "kl": 0.020055259950459002, "learning_rate": 1.2323232323232323e-06, "loss": 0.06447743773460388, "memory(GiB)": 67.84, "reward": 0.8588416814804077, "reward_std": 0.4532589942216873, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.40650616884231566, "rewards/ReportKG_Jaccard/mean": 0.20884168967604638, "rewards/ReportKG_Jaccard/std": 0.06314805373549462, "step": 305, "train_speed(iter/s)": 0.045017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 73.1, "completions/min_length": 49.2, "epoch": 0.06262626262626263, "grad_norm": 1.439287543296814, "kl": 0.014832521602511407, "learning_rate": 1.2525252525252525e-06, "loss": -0.031109848618507387, "memory(GiB)": 67.84, "reward": 0.9929809093475341, "reward_std": 0.3400525197386742, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.19298094809055327, "rewards/ReportKG_Jaccard/std": 0.06219401806592941, "step": 310, "train_speed(iter/s)": 0.044813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 74.275, "completions/min_length": 56.2, "epoch": 0.06363636363636363, "grad_norm": 1.5303009748458862, "kl": 0.009115961752831936, "learning_rate": 1.2727272727272726e-06, "loss": -0.004865527153015137, "memory(GiB)": 67.84, "reward": 0.6936949133872986, "reward_std": 0.42324295490980146, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.38463483452796937, "rewards/ReportKG_Jaccard/mean": 0.1686949238181114, "rewards/ReportKG_Jaccard/std": 0.08245058655738831, "step": 315, "train_speed(iter/s)": 0.044575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 75.15, "completions/min_length": 54.8, "epoch": 0.06464646464646465, "grad_norm": 1.4972658157348633, "kl": 0.014789614547044038, "learning_rate": 1.292929292929293e-06, "loss": 0.013470734655857085, "memory(GiB)": 67.84, "reward": 1.0110676765441895, "reward_std": 0.4216969072818756, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.1860676735639572, "rewards/ReportKG_Jaccard/std": 0.07991523668169975, "step": 320, "train_speed(iter/s)": 0.044353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 72.275, "completions/min_length": 55.2, "epoch": 0.06565656565656566, "grad_norm": 1.622942328453064, "kl": 0.006109606102108955, "learning_rate": 1.3131313131313131e-06, "loss": 0.037330815196037294, "memory(GiB)": 67.84, "reward": 0.306291201710701, "reward_std": 0.33467262536287307, "rewards/MultiModalAccuracyORM_Any/mean": 0.125, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.18129119779914618, "rewards/ReportKG_Jaccard/std": 0.04343459643423557, "step": 325, "train_speed(iter/s)": 0.044352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 68.4, "completions/min_length": 53.6, "epoch": 0.06666666666666667, "grad_norm": 1.2544864416122437, "kl": 0.015247172955423594, "learning_rate": 1.3333333333333332e-06, "loss": 0.07467058897018433, "memory(GiB)": 67.84, "reward": 0.6326038241386414, "reward_std": 0.4118665993213654, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.15760384649038314, "rewards/ReportKG_Jaccard/std": 0.07309683412313461, "step": 330, "train_speed(iter/s)": 0.044389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 68.9, "completions/min_length": 47.4, "epoch": 0.06767676767676768, "grad_norm": 1.2737764120101929, "kl": 0.008824151568114757, "learning_rate": 1.3535353535353535e-06, "loss": -0.008112408220767975, "memory(GiB)": 67.84, "reward": 0.6764769047498703, "reward_std": 0.33589830100536344, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22647689431905746, "rewards/ReportKG_Jaccard/std": 0.09705003350973129, "step": 335, "train_speed(iter/s)": 0.04437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 74.875, "completions/min_length": 52.6, "epoch": 0.06868686868686869, "grad_norm": 1.1448469161987305, "kl": 0.0085934117436409, "learning_rate": 1.3737373737373735e-06, "loss": 0.026493942737579344, "memory(GiB)": 67.84, "reward": 1.0039404153823852, "reward_std": 0.22082052230834961, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.1539404034614563, "rewards/ReportKG_Jaccard/std": 0.05587775148451328, "step": 340, "train_speed(iter/s)": 0.044382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.2, "completions/mean_length": 63.975, "completions/min_length": 49.0, "epoch": 0.0696969696969697, "grad_norm": 1.372779369354248, "kl": 0.007844180986285209, "learning_rate": 1.393939393939394e-06, "loss": -0.011814530193805694, "memory(GiB)": 67.84, "reward": 1.0010760426521301, "reward_std": 0.43066564947366714, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.3771016776561737, "rewards/ReportKG_Jaccard/mean": 0.2760760337114334, "rewards/ReportKG_Jaccard/std": 0.07701248526573182, "step": 345, "train_speed(iter/s)": 0.044474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 72.475, "completions/min_length": 54.8, "epoch": 0.0707070707070707, "grad_norm": 0.9336965084075928, "kl": 0.01051368284970522, "learning_rate": 1.414141414141414e-06, "loss": 0.006796014308929443, "memory(GiB)": 67.84, "reward": 1.065207266807556, "reward_std": 0.2859183639287949, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2652072995901108, "rewards/ReportKG_Jaccard/std": 0.07207776792347431, "step": 350, "train_speed(iter/s)": 0.044464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 68.8, "completions/min_length": 52.8, "epoch": 0.07171717171717172, "grad_norm": 1.165823221206665, "kl": 0.012377941515296698, "learning_rate": 1.4343434343434341e-06, "loss": 0.004579921811819076, "memory(GiB)": 67.84, "reward": 0.7398708865046502, "reward_std": 0.36421899050474166, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.31392415761947634, "rewards/ReportKG_Jaccard/mean": 0.23987089097499847, "rewards/ReportKG_Jaccard/std": 0.10114078894257546, "step": 355, "train_speed(iter/s)": 0.044571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.2, "completions/mean_length": 79.475, "completions/min_length": 58.8, "epoch": 0.07272727272727272, "grad_norm": 1.5787031650543213, "kl": 0.011791917402297259, "learning_rate": 1.4545454545454544e-06, "loss": 0.010196195542812347, "memory(GiB)": 68.5, "reward": 0.8186060786247253, "reward_std": 0.3987011253833771, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.16860606372356415, "rewards/ReportKG_Jaccard/std": 0.0661143098026514, "step": 360, "train_speed(iter/s)": 0.04455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 74.45, "completions/min_length": 57.8, "epoch": 0.07373737373737374, "grad_norm": 1.5776063203811646, "kl": 0.008922658022493124, "learning_rate": 1.4747474747474747e-06, "loss": -0.011032012850046157, "memory(GiB)": 68.5, "reward": 1.0694725275039674, "reward_std": 0.2378984048962593, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2944725424051285, "rewards/ReportKG_Jaccard/std": 0.09249584674835205, "step": 365, "train_speed(iter/s)": 0.044424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 77.45, "completions/min_length": 59.4, "epoch": 0.07474747474747474, "grad_norm": 1.6141842603683472, "kl": 0.007463734038174153, "learning_rate": 1.494949494949495e-06, "loss": -0.041665691137313846, "memory(GiB)": 68.5, "reward": 0.6952153265476226, "reward_std": 0.4718452215194702, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.2202153116464615, "rewards/ReportKG_Jaccard/std": 0.05260085612535477, "step": 370, "train_speed(iter/s)": 0.044341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.2, "completions/mean_length": 85.975, "completions/min_length": 67.0, "epoch": 0.07575757575757576, "grad_norm": 1.2355207204818726, "kl": 0.009170501679182052, "learning_rate": 1.515151515151515e-06, "loss": 0.014372843503952026, "memory(GiB)": 68.5, "reward": 0.47791164815425874, "reward_std": 0.2267471507191658, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.1776151716709137, "rewards/ReportKG_Jaccard/mean": 0.15291164815425873, "rewards/ReportKG_Jaccard/std": 0.054336291924119, "step": 375, "train_speed(iter/s)": 0.044186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.4, "completions/mean_length": 75.625, "completions/min_length": 54.6, "epoch": 0.07676767676767676, "grad_norm": 1.4212702512741089, "kl": 0.00809121634811163, "learning_rate": 1.535353535353535e-06, "loss": -0.0266552597284317, "memory(GiB)": 68.5, "reward": 0.6479459822177887, "reward_std": 0.344845400005579, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.22294600307941437, "rewards/ReportKG_Jaccard/std": 0.07651115134358406, "step": 380, "train_speed(iter/s)": 0.044218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 70.975, "completions/min_length": 51.4, "epoch": 0.07777777777777778, "grad_norm": 1.4586416482925415, "kl": 0.012798486649990082, "learning_rate": 1.5555555555555556e-06, "loss": -0.004399623349308967, "memory(GiB)": 68.5, "reward": 0.9514293551445008, "reward_std": 0.41532640904188156, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.1764293745160103, "rewards/ReportKG_Jaccard/std": 0.07010012865066528, "step": 385, "train_speed(iter/s)": 0.044286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 76.225, "completions/min_length": 55.4, "epoch": 0.07878787878787878, "grad_norm": 1.6041502952575684, "kl": 0.0181792126968503, "learning_rate": 1.5757575757575756e-06, "loss": 0.013462892174720765, "memory(GiB)": 68.5, "reward": 1.154429566860199, "reward_std": 0.3112015396356583, "rewards/MultiModalAccuracyORM_Any/mean": 0.9, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.25442960299551487, "rewards/ReportKG_Jaccard/std": 0.05581766553223133, "step": 390, "train_speed(iter/s)": 0.044285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 74.9, "completions/min_length": 56.0, "epoch": 0.0797979797979798, "grad_norm": 1.3520389795303345, "kl": 0.009351130668073892, "learning_rate": 1.595959595959596e-06, "loss": 0.0004954785108566284, "memory(GiB)": 68.5, "reward": 0.9120234727859498, "reward_std": 0.3934382706880569, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.2870235025882721, "rewards/ReportKG_Jaccard/std": 0.06985773667693138, "step": 395, "train_speed(iter/s)": 0.044297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 74.675, "completions/min_length": 58.0, "epoch": 0.08080808080808081, "grad_norm": 1.283292293548584, "kl": 0.014316144399344921, "learning_rate": 1.616161616161616e-06, "loss": 0.007212807238101959, "memory(GiB)": 68.5, "reward": 0.635532146692276, "reward_std": 0.4767256617546082, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.46628902554512025, "rewards/ReportKG_Jaccard/mean": 0.23553213253617286, "rewards/ReportKG_Jaccard/std": 0.05512661337852478, "step": 400, "train_speed(iter/s)": 0.044252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.2, "completions/mean_length": 78.85, "completions/min_length": 60.8, "epoch": 0.08181818181818182, "grad_norm": 1.268739938735962, "kl": 0.012672633677721024, "learning_rate": 1.6363636363636365e-06, "loss": 0.011344937980175019, "memory(GiB)": 68.5, "reward": 0.7116359770298004, "reward_std": 0.35158574432134626, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.18663596659898757, "rewards/ReportKG_Jaccard/std": 0.05809076130390167, "step": 405, "train_speed(iter/s)": 0.044198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 70.675, "completions/min_length": 55.6, "epoch": 0.08282828282828283, "grad_norm": 1.2463871240615845, "kl": 0.009623069502413272, "learning_rate": 1.6565656565656565e-06, "loss": 0.024917614459991456, "memory(GiB)": 68.5, "reward": 0.7948508620262146, "reward_std": 0.3385382480919361, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.1948508322238922, "rewards/ReportKG_Jaccard/std": 0.07528922408819198, "step": 410, "train_speed(iter/s)": 0.04424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.8, "completions/mean_length": 90.675, "completions/min_length": 69.6, "epoch": 0.08383838383838384, "grad_norm": 1.2212516069412231, "kl": 0.01083349548280239, "learning_rate": 1.6767676767676766e-06, "loss": 0.011212028563022614, "memory(GiB)": 68.5, "reward": 0.7624722480773926, "reward_std": 0.39818829447031023, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3771016776561737, "rewards/ReportKG_Jaccard/mean": 0.13747224435210229, "rewards/ReportKG_Jaccard/std": 0.037715214490890506, "step": 415, "train_speed(iter/s)": 0.044221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 71.75, "completions/min_length": 56.4, "epoch": 0.08484848484848485, "grad_norm": 1.6007417440414429, "kl": 0.012163489684462547, "learning_rate": 1.6969696969696969e-06, "loss": -0.010317467898130418, "memory(GiB)": 68.5, "reward": 0.4745453968644142, "reward_std": 0.36906003952026367, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.3484567105770111, "rewards/ReportKG_Jaccard/mean": 0.19954538494348525, "rewards/ReportKG_Jaccard/std": 0.04027755819261074, "step": 420, "train_speed(iter/s)": 0.04415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 132.0, "completions/mean_length": 89.975, "completions/min_length": 67.2, "epoch": 0.08585858585858586, "grad_norm": 1.3423380851745605, "kl": 0.01130115706473589, "learning_rate": 1.7171717171717171e-06, "loss": -0.01792941242456436, "memory(GiB)": 68.5, "reward": 0.7114500284194947, "reward_std": 0.4903077006340027, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4847656965255737, "rewards/ReportKG_Jaccard/mean": 0.21144999861717223, "rewards/ReportKG_Jaccard/std": 0.0670079156756401, "step": 425, "train_speed(iter/s)": 0.043951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/mean_length": 68.6, "completions/min_length": 52.2, "epoch": 0.08686868686868687, "grad_norm": 1.1786645650863647, "kl": 0.01642163973301649, "learning_rate": 1.7373737373737374e-06, "loss": -0.0032656557857990267, "memory(GiB)": 68.5, "reward": 0.7808006703853607, "reward_std": 0.5010292410850525, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.46628902554512025, "rewards/ReportKG_Jaccard/mean": 0.3308006703853607, "rewards/ReportKG_Jaccard/std": 0.08514512330293655, "step": 430, "train_speed(iter/s)": 0.043937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 78.55, "completions/min_length": 56.8, "epoch": 0.08787878787878788, "grad_norm": 0.709911048412323, "kl": 0.01961328275501728, "learning_rate": 1.7575757575757575e-06, "loss": 0.010677720606327056, "memory(GiB)": 68.5, "reward": 0.8852561831474304, "reward_std": 0.38432696759700774, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.18525619059801102, "rewards/ReportKG_Jaccard/std": 0.05853230021893978, "step": 435, "train_speed(iter/s)": 0.043882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 69.225, "completions/min_length": 52.0, "epoch": 0.08888888888888889, "grad_norm": 1.3436964750289917, "kl": 0.011205872148275375, "learning_rate": 1.7777777777777775e-06, "loss": 0.024135738611221313, "memory(GiB)": 68.5, "reward": 0.7633623123168946, "reward_std": 0.42044240683317186, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.18836229890584946, "rewards/ReportKG_Jaccard/std": 0.07588322684168816, "step": 440, "train_speed(iter/s)": 0.043703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 79.875, "completions/min_length": 63.4, "epoch": 0.0898989898989899, "grad_norm": 0.9094604253768921, "kl": 0.0117964718490839, "learning_rate": 1.797979797979798e-06, "loss": 0.010618197917938232, "memory(GiB)": 68.5, "reward": 0.8341615170240402, "reward_std": 0.3731063447892666, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2591615289449692, "rewards/ReportKG_Jaccard/std": 0.07502427101135253, "step": 445, "train_speed(iter/s)": 0.043516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 76.45, "completions/min_length": 58.6, "epoch": 0.09090909090909091, "grad_norm": 1.5657342672348022, "kl": 0.016893143579363824, "learning_rate": 1.818181818181818e-06, "loss": 0.02128390669822693, "memory(GiB)": 68.5, "reward": 1.1006024718284606, "reward_std": 0.3616612687706947, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.25060250163078307, "rewards/ReportKG_Jaccard/std": 0.06801890730857849, "step": 450, "train_speed(iter/s)": 0.043514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 73.825, "completions/min_length": 52.4, "epoch": 0.09191919191919191, "grad_norm": 1.1480644941329956, "kl": 0.01730834860354662, "learning_rate": 1.8383838383838384e-06, "loss": -0.015753789246082305, "memory(GiB)": 68.5, "reward": 0.6321974992752075, "reward_std": 0.34083566144108773, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.1821975141763687, "rewards/ReportKG_Jaccard/std": 0.06322055906057358, "step": 455, "train_speed(iter/s)": 0.043371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.0, "completions/mean_length": 63.5, "completions/min_length": 46.6, "epoch": 0.09292929292929293, "grad_norm": 1.5945450067520142, "kl": 0.012315271608531474, "learning_rate": 1.8585858585858584e-06, "loss": 0.0402170866727829, "memory(GiB)": 68.5, "reward": 0.969142061471939, "reward_std": 0.316096830368042, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2941420555114746, "rewards/ReportKG_Jaccard/std": 0.10669446140527725, "step": 460, "train_speed(iter/s)": 0.043364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 67.85, "completions/min_length": 51.4, "epoch": 0.09393939393939393, "grad_norm": 1.4903204441070557, "kl": 0.013215941563248634, "learning_rate": 1.878787878787879e-06, "loss": 0.017053934931755065, "memory(GiB)": 68.5, "reward": 0.9456157147884369, "reward_std": 0.32966938316822053, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.22061570286750792, "rewards/ReportKG_Jaccard/std": 0.07402674034237862, "step": 465, "train_speed(iter/s)": 0.043342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 142.2, "completions/mean_length": 94.95, "completions/min_length": 69.8, "epoch": 0.09494949494949495, "grad_norm": 1.2871638536453247, "kl": 0.017136347852647303, "learning_rate": 1.898989898989899e-06, "loss": -0.005767977237701416, "memory(GiB)": 68.5, "reward": 0.8130840510129929, "reward_std": 0.4465097010135651, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.16308404356241227, "rewards/ReportKG_Jaccard/std": 0.07541857697069645, "step": 470, "train_speed(iter/s)": 0.043135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.8, "completions/mean_length": 68.5, "completions/min_length": 52.4, "epoch": 0.09595959595959595, "grad_norm": 1.4094610214233398, "kl": 0.014102161582559346, "learning_rate": 1.9191919191919192e-06, "loss": -0.04260334372520447, "memory(GiB)": 68.5, "reward": 0.84809210896492, "reward_std": 0.36215881556272506, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.24809210896492004, "rewards/ReportKG_Jaccard/std": 0.07000579982995987, "step": 475, "train_speed(iter/s)": 0.043147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 69.775, "completions/min_length": 58.2, "epoch": 0.09696969696969697, "grad_norm": 1.9130076169967651, "kl": 0.014109329134225846, "learning_rate": 1.9393939393939395e-06, "loss": 0.029845941066741943, "memory(GiB)": 68.5, "reward": 1.0423033952713012, "reward_std": 0.48346521258354186, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.24230343699455262, "rewards/ReportKG_Jaccard/std": 0.09260830730199814, "step": 480, "train_speed(iter/s)": 0.04313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 69.45, "completions/min_length": 51.0, "epoch": 0.09797979797979799, "grad_norm": 1.0361273288726807, "kl": 0.017884697299450636, "learning_rate": 1.9595959595959594e-06, "loss": 0.059948956966400145, "memory(GiB)": 68.5, "reward": 0.6483400791883469, "reward_std": 0.3792906180024147, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.1983400523662567, "rewards/ReportKG_Jaccard/std": 0.07682887203991413, "step": 485, "train_speed(iter/s)": 0.043149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.2, "completions/mean_length": 62.475, "completions/min_length": 48.4, "epoch": 0.09898989898989899, "grad_norm": 1.4910826683044434, "kl": 0.01952340630814433, "learning_rate": 1.9797979797979796e-06, "loss": 0.1147850751876831, "memory(GiB)": 68.5, "reward": 0.8540554702281952, "reward_std": 0.33066306039690974, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.22905551195144652, "rewards/ReportKG_Jaccard/std": 0.05686618536710739, "step": 490, "train_speed(iter/s)": 0.043213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.4, "completions/mean_length": 71.65, "completions/min_length": 60.4, "epoch": 0.1, "grad_norm": 1.3144333362579346, "kl": 0.015216668136417866, "learning_rate": 2e-06, "loss": 0.007929515093564987, "memory(GiB)": 68.5, "reward": 0.9520285248756408, "reward_std": 0.32122486680746076, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2770285278558731, "rewards/ReportKG_Jaccard/std": 0.1013006791472435, "step": 495, "train_speed(iter/s)": 0.043284 }, { "epoch": 0.10101010101010101, "grad_norm": 1.2611877918243408, "learning_rate": 2.02020202020202e-06, "loss": 0.027586257457733153, "memory(GiB)": 68.5, "step": 500, "train_speed(iter/s)": 0.043329 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 98.02, "eval_completions/mean_length": 75.695, "eval_completions/min_length": 56.98, "eval_kl": 0.017766487635672092, "eval_loss": 0.010875207372009754, "eval_reward": 0.7555562871694564, "eval_reward_std": 0.2712204258516431, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.5475, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.22026692986488342, "eval_rewards/ReportKG_Jaccard/mean": 0.20805628418922426, "eval_rewards/ReportKG_Jaccard/std": 0.06257611729204654, "eval_runtime": 885.5549, "eval_samples_per_second": 0.056, "eval_steps_per_second": 0.008, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/mean_length": 75.4125, "completions/min_length": 58.4, "epoch": 0.10202020202020202, "grad_norm": 1.3783272504806519, "kl": 0.012329869251698256, "learning_rate": 2.04040404040404e-06, "loss": 0.04108564853668213, "memory(GiB)": 68.5, "reward": 0.6551630452275277, "reward_std": 0.3645056612789631, "rewards/MultiModalAccuracyORM_Any/mean": 0.3625, "rewards/MultiModalAccuracyORM_Any/std": 0.2777381867170334, "rewards/ReportKG_Jaccard/mean": 0.292663049697876, "rewards/ReportKG_Jaccard/std": 0.09690524078905582, "step": 505, "train_speed(iter/s)": 0.03993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 74.075, "completions/min_length": 58.2, "epoch": 0.10303030303030303, "grad_norm": 1.0285346508026123, "kl": 0.012950331531465053, "learning_rate": 2.0606060606060603e-06, "loss": 0.031212010979652406, "memory(GiB)": 68.5, "reward": 0.8699927806854248, "reward_std": 0.482296359539032, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.38463483452796937, "rewards/ReportKG_Jaccard/mean": 0.29499282240867614, "rewards/ReportKG_Jaccard/std": 0.10993188843131066, "step": 510, "train_speed(iter/s)": 0.039871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 70.1, "completions/min_length": 50.2, "epoch": 0.10404040404040404, "grad_norm": 1.399632215499878, "kl": 0.026916974782943727, "learning_rate": 2.0808080808080806e-06, "loss": -0.030089327692985536, "memory(GiB)": 68.5, "reward": 0.7156638026237487, "reward_std": 0.381196653842926, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.14066379517316818, "rewards/ReportKG_Jaccard/std": 0.05714865028858185, "step": 515, "train_speed(iter/s)": 0.039866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.0, "completions/mean_length": 63.7, "completions/min_length": 50.8, "epoch": 0.10505050505050505, "grad_norm": 1.4272738695144653, "kl": 0.02102918941527605, "learning_rate": 2.1010101010101013e-06, "loss": 0.01423012912273407, "memory(GiB)": 68.5, "reward": 0.9327169418334961, "reward_std": 0.23032859191298485, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.23271694481372834, "rewards/ReportKG_Jaccard/std": 0.08631590679287911, "step": 520, "train_speed(iter/s)": 0.039923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.6, "completions/mean_length": 73.525, "completions/min_length": 50.8, "epoch": 0.10606060606060606, "grad_norm": 1.1396820545196533, "kl": 0.013854903168976307, "learning_rate": 2.121212121212121e-06, "loss": -0.04957354366779328, "memory(GiB)": 68.5, "reward": 0.7720336318016052, "reward_std": 0.42384730130434034, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.38463483452796937, "rewards/ReportKG_Jaccard/mean": 0.2470336377620697, "rewards/ReportKG_Jaccard/std": 0.06499501541256905, "step": 525, "train_speed(iter/s)": 0.040018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.6, "completions/mean_length": 78.175, "completions/min_length": 57.2, "epoch": 0.10707070707070707, "grad_norm": 1.2834540605545044, "kl": 0.02072117179632187, "learning_rate": 2.1414141414141414e-06, "loss": 0.04628574550151825, "memory(GiB)": 68.5, "reward": 1.1345594644546508, "reward_std": 0.31080237925052645, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2595594435930252, "rewards/ReportKG_Jaccard/std": 0.07819253355264663, "step": 530, "train_speed(iter/s)": 0.040041 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 78.7, "completions/min_length": 59.6, "epoch": 0.10808080808080808, "grad_norm": 1.5838344097137451, "kl": 0.019289080053567886, "learning_rate": 2.1616161616161617e-06, "loss": 0.019858184456825256, "memory(GiB)": 68.5, "reward": 0.9605679392814637, "reward_std": 0.3256172209978104, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.2845196664333344, "rewards/ReportKG_Jaccard/mean": 0.18556794226169587, "rewards/ReportKG_Jaccard/std": 0.049742120504379275, "step": 535, "train_speed(iter/s)": 0.039863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 66.825, "completions/min_length": 51.2, "epoch": 0.10909090909090909, "grad_norm": 1.1395540237426758, "kl": 0.026492477953433992, "learning_rate": 2.1818181818181815e-06, "loss": 0.04784111082553864, "memory(GiB)": 68.5, "reward": 0.840999636054039, "reward_std": 0.19488800168037415, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.190999598801136, "rewards/ReportKG_Jaccard/std": 0.07149516940116882, "step": 540, "train_speed(iter/s)": 0.039871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.2, "completions/mean_length": 78.725, "completions/min_length": 57.8, "epoch": 0.1101010101010101, "grad_norm": 1.3821678161621094, "kl": 0.015515591949224472, "learning_rate": 2.202020202020202e-06, "loss": -0.06505037546157837, "memory(GiB)": 68.5, "reward": 0.7116603493690491, "reward_std": 0.34045641869306564, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.16166035160422326, "rewards/ReportKG_Jaccard/std": 0.06914588287472725, "step": 545, "train_speed(iter/s)": 0.039832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.6, "completions/mean_length": 86.725, "completions/min_length": 53.2, "epoch": 0.1111111111111111, "grad_norm": 1.3414802551269531, "kl": 0.01445132978260517, "learning_rate": 2.222222222222222e-06, "loss": -0.05957741141319275, "memory(GiB)": 68.5, "reward": 0.77498539686203, "reward_std": 0.1861722268164158, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.17498542070388795, "rewards/ReportKG_Jaccard/std": 0.06128131002187729, "step": 550, "train_speed(iter/s)": 0.039677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 72.275, "completions/min_length": 51.8, "epoch": 0.11212121212121212, "grad_norm": 1.643013834953308, "kl": 0.021524151600897313, "learning_rate": 2.242424242424242e-06, "loss": -0.053434067964553834, "memory(GiB)": 68.5, "reward": 0.5697046935558319, "reward_std": 0.32477950900793073, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.19470469653606415, "rewards/ReportKG_Jaccard/std": 0.0681751772761345, "step": 555, "train_speed(iter/s)": 0.039468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 72.475, "completions/min_length": 54.4, "epoch": 0.11313131313131314, "grad_norm": 1.597252368927002, "kl": 0.01781420726329088, "learning_rate": 2.2626262626262626e-06, "loss": 0.10537570714950562, "memory(GiB)": 68.5, "reward": 0.5618405520915986, "reward_std": 0.41160630583763125, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.16184055656194687, "rewards/ReportKG_Jaccard/std": 0.05089630521833897, "step": 560, "train_speed(iter/s)": 0.039443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.2, "completions/mean_length": 79.0, "completions/min_length": 60.4, "epoch": 0.11414141414141414, "grad_norm": 1.2429611682891846, "kl": 0.0185189263895154, "learning_rate": 2.282828282828283e-06, "loss": 0.015822988748550416, "memory(GiB)": 68.5, "reward": 0.47806061804294586, "reward_std": 0.31727105677127837, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.2030606150627136, "rewards/ReportKG_Jaccard/std": 0.050711911916732785, "step": 565, "train_speed(iter/s)": 0.039304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 68.925, "completions/min_length": 47.8, "epoch": 0.11515151515151516, "grad_norm": 1.3234635591506958, "kl": 0.028683018498122693, "learning_rate": 2.303030303030303e-06, "loss": -0.012895090878009796, "memory(GiB)": 68.5, "reward": 0.8986620217561722, "reward_std": 0.24391312450170516, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.19866201877593995, "rewards/ReportKG_Jaccard/std": 0.08320177122950553, "step": 570, "train_speed(iter/s)": 0.039379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.4, "completions/mean_length": 71.775, "completions/min_length": 53.0, "epoch": 0.11616161616161616, "grad_norm": 1.562609076499939, "kl": 0.0134876923635602, "learning_rate": 2.323232323232323e-06, "loss": 0.0391345739364624, "memory(GiB)": 68.5, "reward": 0.6829781323671341, "reward_std": 0.331473883241415, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.1579781301319599, "rewards/ReportKG_Jaccard/std": 0.14129109308123589, "step": 575, "train_speed(iter/s)": 0.039472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.2, "completions/mean_length": 67.525, "completions/min_length": 55.2, "epoch": 0.11717171717171718, "grad_norm": 1.1818801164627075, "kl": 0.024541821889579297, "learning_rate": 2.3434343434343433e-06, "loss": -0.01748192012310028, "memory(GiB)": 68.5, "reward": 0.7089490175247193, "reward_std": 0.3263598456978798, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.15894900858402253, "rewards/ReportKG_Jaccard/std": 0.08701385408639908, "step": 580, "train_speed(iter/s)": 0.039529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/mean_length": 74.975, "completions/min_length": 58.0, "epoch": 0.11818181818181818, "grad_norm": 1.5240614414215088, "kl": 0.012345336563885211, "learning_rate": 2.3636363636363636e-06, "loss": 0.09115396738052368, "memory(GiB)": 68.5, "reward": 1.218348789215088, "reward_std": 0.22589830607175826, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2933487743139267, "rewards/ReportKG_Jaccard/std": 0.07546769082546234, "step": 585, "train_speed(iter/s)": 0.039457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.2, "completions/mean_length": 70.775, "completions/min_length": 54.6, "epoch": 0.1191919191919192, "grad_norm": 1.0132863521575928, "kl": 0.017941660620272158, "learning_rate": 2.3838383838383834e-06, "loss": 0.026334095001220702, "memory(GiB)": 68.5, "reward": 0.9205698728561401, "reward_std": 0.41876084804534913, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2705698758363724, "rewards/ReportKG_Jaccard/std": 0.08034459576010704, "step": 590, "train_speed(iter/s)": 0.039486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.2, "completions/mean_length": 88.825, "completions/min_length": 61.0, "epoch": 0.1202020202020202, "grad_norm": 1.1069679260253906, "kl": 0.01885344982147217, "learning_rate": 2.4040404040404037e-06, "loss": -0.06920656561851501, "memory(GiB)": 68.5, "reward": 0.8695073843002319, "reward_std": 0.33051636815071106, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.21950736939907073, "rewards/ReportKG_Jaccard/std": 0.08180022314190864, "step": 595, "train_speed(iter/s)": 0.03944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.8, "completions/mean_length": 68.775, "completions/min_length": 56.2, "epoch": 0.12121212121212122, "grad_norm": 1.5469014644622803, "kl": 0.03215445540845394, "learning_rate": 2.4242424242424244e-06, "loss": 0.015417435765266418, "memory(GiB)": 68.5, "reward": 1.0796879172325133, "reward_std": 0.3192516595125198, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22968790233135222, "rewards/ReportKG_Jaccard/std": 0.07916812114417553, "step": 600, "train_speed(iter/s)": 0.039537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 70.7, "completions/min_length": 50.4, "epoch": 0.12222222222222222, "grad_norm": 1.204202651977539, "kl": 0.01442983914166689, "learning_rate": 2.4444444444444447e-06, "loss": -0.01380576491355896, "memory(GiB)": 68.5, "reward": 0.7425971567630768, "reward_std": 0.3699374124407768, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.29206851720809934, "rewards/ReportKG_Jaccard/mean": 0.24259715974330903, "rewards/ReportKG_Jaccard/std": 0.09798701629042625, "step": 605, "train_speed(iter/s)": 0.039437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 70.625, "completions/min_length": 54.2, "epoch": 0.12323232323232323, "grad_norm": 1.4368175268173218, "kl": 0.02509705126285553, "learning_rate": 2.4646464646464645e-06, "loss": 0.0491610586643219, "memory(GiB)": 68.5, "reward": 0.9101150274276734, "reward_std": 0.34763688668608667, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.23511501252651215, "rewards/ReportKG_Jaccard/std": 0.05546916946768761, "step": 610, "train_speed(iter/s)": 0.039464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.4, "completions/mean_length": 67.275, "completions/min_length": 48.2, "epoch": 0.12424242424242424, "grad_norm": 2.3844714164733887, "kl": 0.03806553483009338, "learning_rate": 2.4848484848484848e-06, "loss": 0.07591903805732728, "memory(GiB)": 68.5, "reward": 0.9154878199100495, "reward_std": 0.2871960043907166, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2154878072440624, "rewards/ReportKG_Jaccard/std": 0.06840017661452294, "step": 615, "train_speed(iter/s)": 0.039468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.4, "completions/mean_length": 67.125, "completions/min_length": 55.0, "epoch": 0.12525252525252525, "grad_norm": 0.9607030749320984, "kl": 0.014623469673097134, "learning_rate": 2.505050505050505e-06, "loss": 0.0348100334405899, "memory(GiB)": 68.5, "reward": 0.8676259458065033, "reward_std": 0.35620094537734986, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.2426259547472, "rewards/ReportKG_Jaccard/std": 0.07576442733407021, "step": 620, "train_speed(iter/s)": 0.039564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 67.6, "completions/min_length": 51.4, "epoch": 0.12626262626262627, "grad_norm": 1.4798163175582886, "kl": 0.03531459923833609, "learning_rate": 2.525252525252525e-06, "loss": 0.0495208203792572, "memory(GiB)": 68.5, "reward": 0.8919907629489898, "reward_std": 0.2285119764506817, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.19199078679084777, "rewards/ReportKG_Jaccard/std": 0.07070913910865784, "step": 625, "train_speed(iter/s)": 0.039609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.6, "completions/mean_length": 80.575, "completions/min_length": 53.2, "epoch": 0.12727272727272726, "grad_norm": 1.3770930767059326, "kl": 0.02741134762763977, "learning_rate": 2.545454545454545e-06, "loss": -0.027949297428131105, "memory(GiB)": 68.5, "reward": 0.8956069231033326, "reward_std": 0.36026850193738935, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.1706069067120552, "rewards/ReportKG_Jaccard/std": 0.05056754946708679, "step": 630, "train_speed(iter/s)": 0.039644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.4, "completions/mean_length": 60.35, "completions/min_length": 45.6, "epoch": 0.12828282828282828, "grad_norm": 1.7357592582702637, "kl": 0.030155269987881185, "learning_rate": 2.5656565656565655e-06, "loss": 0.13264509439468383, "memory(GiB)": 68.5, "reward": 0.7215390801429749, "reward_std": 0.263660255074501, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.1965390585362911, "rewards/ReportKG_Jaccard/std": 0.08039348535239696, "step": 635, "train_speed(iter/s)": 0.039556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 71.35, "completions/min_length": 58.0, "epoch": 0.1292929292929293, "grad_norm": 1.3951467275619507, "kl": 0.020271953381597995, "learning_rate": 2.585858585858586e-06, "loss": 0.016705265641212462, "memory(GiB)": 68.5, "reward": 1.2044912815093993, "reward_std": 0.37696859836578367, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.3294913023710251, "rewards/ReportKG_Jaccard/std": 0.09312350451946258, "step": 640, "train_speed(iter/s)": 0.039551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 70.5, "completions/min_length": 54.0, "epoch": 0.1303030303030303, "grad_norm": 1.963049292564392, "kl": 0.022393659502267838, "learning_rate": 2.606060606060606e-06, "loss": 0.078030264377594, "memory(GiB)": 68.5, "reward": 0.8462746441364288, "reward_std": 0.22090898752212523, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.2462746262550354, "rewards/ReportKG_Jaccard/std": 0.09247916787862778, "step": 645, "train_speed(iter/s)": 0.039558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 68.125, "completions/min_length": 51.4, "epoch": 0.13131313131313133, "grad_norm": 1.6629993915557861, "kl": 0.019097846001386642, "learning_rate": 2.6262626262626263e-06, "loss": 0.001597529463469982, "memory(GiB)": 68.5, "reward": 1.0629679083824157, "reward_std": 0.16186525225639342, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.28796790838241576, "rewards/ReportKG_Jaccard/std": 0.10374877452850342, "step": 650, "train_speed(iter/s)": 0.039561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 69.25, "completions/min_length": 50.0, "epoch": 0.13232323232323231, "grad_norm": 1.588690996170044, "kl": 0.02539397794753313, "learning_rate": 2.6464646464646466e-06, "loss": 0.0007601380348205566, "memory(GiB)": 68.5, "reward": 0.7363324791193009, "reward_std": 0.2910207100212574, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.18633247613906861, "rewards/ReportKG_Jaccard/std": 0.051857287809252736, "step": 655, "train_speed(iter/s)": 0.039438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 71.875, "completions/min_length": 52.0, "epoch": 0.13333333333333333, "grad_norm": 1.2600975036621094, "kl": 0.018668036535382272, "learning_rate": 2.6666666666666664e-06, "loss": -0.017712594568729402, "memory(GiB)": 68.5, "reward": 0.9719547390937805, "reward_std": 0.2066115364432335, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.17195475697517396, "rewards/ReportKG_Jaccard/std": 0.07621757462620735, "step": 660, "train_speed(iter/s)": 0.039452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.0, "completions/mean_length": 64.05, "completions/min_length": 50.6, "epoch": 0.13434343434343435, "grad_norm": 1.7458336353302002, "kl": 0.03390205092728138, "learning_rate": 2.6868686868686867e-06, "loss": 0.02232755869626999, "memory(GiB)": 68.5, "reward": 0.6713563799858093, "reward_std": 0.47217610478401184, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.22135640382766725, "rewards/ReportKG_Jaccard/std": 0.07841874398291111, "step": 665, "train_speed(iter/s)": 0.0395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 78.05, "completions/min_length": 56.4, "epoch": 0.13535353535353536, "grad_norm": 1.3086662292480469, "kl": 0.020628476701676845, "learning_rate": 2.707070707070707e-06, "loss": -0.02443169355392456, "memory(GiB)": 68.5, "reward": 0.6997994601726532, "reward_std": 0.3940268710255623, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.1747994601726532, "rewards/ReportKG_Jaccard/std": 0.05868461616337299, "step": 670, "train_speed(iter/s)": 0.039455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 78.7, "completions/min_length": 58.6, "epoch": 0.13636363636363635, "grad_norm": 1.8087555170059204, "kl": 0.025321127846837044, "learning_rate": 2.727272727272727e-06, "loss": -0.03520652651786804, "memory(GiB)": 68.5, "reward": 0.8645887017250061, "reward_std": 0.47220492362976074, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.23958869129419327, "rewards/ReportKG_Jaccard/std": 0.05681227035820484, "step": 675, "train_speed(iter/s)": 0.039525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.6, "completions/mean_length": 79.225, "completions/min_length": 61.6, "epoch": 0.13737373737373737, "grad_norm": 1.2978334426879883, "kl": 0.018598266504704954, "learning_rate": 2.747474747474747e-06, "loss": 0.011860303580760956, "memory(GiB)": 68.5, "reward": 0.8384049952030181, "reward_std": 0.4682290077209473, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4410387217998505, "rewards/ReportKG_Jaccard/mean": 0.2634049832820892, "rewards/ReportKG_Jaccard/std": 0.053022689372301104, "step": 680, "train_speed(iter/s)": 0.039414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.6, "completions/mean_length": 88.85, "completions/min_length": 61.4, "epoch": 0.1383838383838384, "grad_norm": 1.1970250606536865, "kl": 0.01993091944605112, "learning_rate": 2.7676767676767678e-06, "loss": -0.010761469602584839, "memory(GiB)": 68.5, "reward": 0.9005372941493988, "reward_std": 0.13842977881431578, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.15053730309009553, "rewards/ReportKG_Jaccard/std": 0.05947345420718193, "step": 685, "train_speed(iter/s)": 0.039276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.8, "completions/mean_length": 65.7, "completions/min_length": 54.0, "epoch": 0.1393939393939394, "grad_norm": 1.6220810413360596, "kl": 0.04013685919344425, "learning_rate": 2.787878787878788e-06, "loss": 0.01488061249256134, "memory(GiB)": 68.5, "reward": 1.018332827091217, "reward_std": 0.26603986620903014, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.24333284497261048, "rewards/ReportKG_Jaccard/std": 0.07188625410199165, "step": 690, "train_speed(iter/s)": 0.039303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 72.85, "completions/min_length": 56.0, "epoch": 0.1404040404040404, "grad_norm": 1.2674789428710938, "kl": 0.03274518642574549, "learning_rate": 2.808080808080808e-06, "loss": -0.0004032808355987072, "memory(GiB)": 68.5, "reward": 0.7178504347801209, "reward_std": 0.4557041108608246, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.24285042583942412, "rewards/ReportKG_Jaccard/std": 0.06925657540559768, "step": 695, "train_speed(iter/s)": 0.039188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 72.4, "completions/min_length": 55.0, "epoch": 0.1414141414141414, "grad_norm": 1.4440639019012451, "kl": 0.02169136255979538, "learning_rate": 2.828282828282828e-06, "loss": 0.054046142101287845, "memory(GiB)": 68.5, "reward": 0.8868330836296081, "reward_std": 0.4822139322757721, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.21183308959007263, "rewards/ReportKG_Jaccard/std": 0.06437852270901204, "step": 700, "train_speed(iter/s)": 0.03924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 79.675, "completions/min_length": 62.4, "epoch": 0.14242424242424243, "grad_norm": 1.552746057510376, "kl": 0.019572589173913003, "learning_rate": 2.8484848484848484e-06, "loss": 0.03401562571525574, "memory(GiB)": 68.5, "reward": 0.7456775486469269, "reward_std": 0.4551581978797913, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.17067755460739137, "rewards/ReportKG_Jaccard/std": 0.05416046231985092, "step": 705, "train_speed(iter/s)": 0.03925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.4, "completions/mean_length": 81.95, "completions/min_length": 57.8, "epoch": 0.14343434343434344, "grad_norm": 1.3673573732376099, "kl": 0.01744922809302807, "learning_rate": 2.8686868686868683e-06, "loss": -0.0038102611899375914, "memory(GiB)": 68.5, "reward": 0.7477305382490158, "reward_std": 0.23688654452562333, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.1851640224456787, "rewards/ReportKG_Jaccard/mean": 0.2477305144071579, "rewards/ReportKG_Jaccard/std": 0.05957964584231377, "step": 710, "train_speed(iter/s)": 0.039287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.0, "completions/mean_length": 85.55, "completions/min_length": 60.4, "epoch": 0.14444444444444443, "grad_norm": 1.4721547365188599, "kl": 0.01784121599048376, "learning_rate": 2.8888888888888886e-06, "loss": 0.02380070984363556, "memory(GiB)": 68.5, "reward": 0.7851297378540039, "reward_std": 0.3707249477505684, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.185129714012146, "rewards/ReportKG_Jaccard/std": 0.06394318342208863, "step": 715, "train_speed(iter/s)": 0.039187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/mean_length": 70.925, "completions/min_length": 50.8, "epoch": 0.14545454545454545, "grad_norm": 1.204360008239746, "kl": 0.029993564076721668, "learning_rate": 2.909090909090909e-06, "loss": 0.024702277779579163, "memory(GiB)": 68.5, "reward": 0.674073526263237, "reward_std": 0.17492494210600854, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.22407355010509492, "rewards/ReportKG_Jaccard/std": 0.08330952003598213, "step": 720, "train_speed(iter/s)": 0.039198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 73.65, "completions/min_length": 53.8, "epoch": 0.14646464646464646, "grad_norm": 1.28031587600708, "kl": 0.03066256046295166, "learning_rate": 2.9292929292929295e-06, "loss": 0.02127360850572586, "memory(GiB)": 68.5, "reward": 0.7769637256860733, "reward_std": 0.2750044822692871, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.20701966285705567, "rewards/ReportKG_Jaccard/mean": 0.1769637420773506, "rewards/ReportKG_Jaccard/std": 0.08229718506336212, "step": 725, "train_speed(iter/s)": 0.039154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 74.0, "completions/min_length": 52.0, "epoch": 0.14747474747474748, "grad_norm": 1.530759572982788, "kl": 0.019686240702867508, "learning_rate": 2.9494949494949494e-06, "loss": -0.0036130651831626894, "memory(GiB)": 68.5, "reward": 0.7143745064735413, "reward_std": 0.3974539488554001, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.239374515414238, "rewards/ReportKG_Jaccard/std": 0.09311287999153137, "step": 730, "train_speed(iter/s)": 0.039179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 76.325, "completions/min_length": 59.2, "epoch": 0.1484848484848485, "grad_norm": 1.0026589632034302, "kl": 0.07013174183666707, "learning_rate": 2.9696969696969697e-06, "loss": 0.01506439745426178, "memory(GiB)": 68.5, "reward": 0.7469920516014099, "reward_std": 0.3457429312169552, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.22199206352233886, "rewards/ReportKG_Jaccard/std": 0.050460445880889895, "step": 735, "train_speed(iter/s)": 0.039177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.0, "completions/mean_length": 63.525, "completions/min_length": 47.8, "epoch": 0.1494949494949495, "grad_norm": 1.3337963819503784, "kl": 0.047727656364440915, "learning_rate": 2.98989898989899e-06, "loss": 0.07381022572517396, "memory(GiB)": 68.5, "reward": 1.1854561924934388, "reward_std": 0.16648023277521135, "rewards/MultiModalAccuracyORM_Any/mean": 0.95, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.23545619547367097, "rewards/ReportKG_Jaccard/std": 0.07775134220719337, "step": 740, "train_speed(iter/s)": 0.039199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 70.5, "completions/min_length": 49.6, "epoch": 0.1505050505050505, "grad_norm": 1.374290108680725, "kl": 0.019335327483713628, "learning_rate": 3.01010101010101e-06, "loss": -0.09268648624420166, "memory(GiB)": 68.5, "reward": 0.7730594396591186, "reward_std": 0.42622352838516236, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.17305942326784135, "rewards/ReportKG_Jaccard/std": 0.07069497257471084, "step": 745, "train_speed(iter/s)": 0.039128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 141.4, "completions/mean_length": 95.65, "completions/min_length": 70.2, "epoch": 0.15151515151515152, "grad_norm": 1.2669302225112915, "kl": 0.015876923315227033, "learning_rate": 3.03030303030303e-06, "loss": 0.04733953475952148, "memory(GiB)": 68.5, "reward": 0.4476535230875015, "reward_std": 0.40210911333560945, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.3771016776561737, "rewards/ReportKG_Jaccard/mean": 0.172653529047966, "rewards/ReportKG_Jaccard/std": 0.054760759323835374, "step": 750, "train_speed(iter/s)": 0.03909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.6, "completions/mean_length": 79.4, "completions/min_length": 58.0, "epoch": 0.15252525252525254, "grad_norm": 1.3172653913497925, "kl": 0.025805617682635784, "learning_rate": 3.0505050505050503e-06, "loss": -0.03474241495132446, "memory(GiB)": 68.5, "reward": 0.6499016225337982, "reward_std": 0.4514175593852997, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.43348987102508546, "rewards/ReportKG_Jaccard/mean": 0.1499016210436821, "rewards/ReportKG_Jaccard/std": 0.06665316820144654, "step": 755, "train_speed(iter/s)": 0.03898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.0, "completions/mean_length": 78.35, "completions/min_length": 61.4, "epoch": 0.15353535353535352, "grad_norm": 1.224544882774353, "kl": 0.05271086134016514, "learning_rate": 3.07070707070707e-06, "loss": -0.0443778395652771, "memory(GiB)": 68.5, "reward": 0.8465203762054443, "reward_std": 0.32657589167356493, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.22152037918567657, "rewards/ReportKG_Jaccard/std": 0.07645204290747643, "step": 760, "train_speed(iter/s)": 0.039025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.0, "completions/mean_length": 78.775, "completions/min_length": 61.2, "epoch": 0.15454545454545454, "grad_norm": 1.1953483819961548, "kl": 0.01954235602170229, "learning_rate": 3.090909090909091e-06, "loss": 0.033939898014068604, "memory(GiB)": 68.5, "reward": 0.6891652762889862, "reward_std": 0.2805962055921555, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2391652762889862, "rewards/ReportKG_Jaccard/std": 0.056439464911818504, "step": 765, "train_speed(iter/s)": 0.038927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 70.05, "completions/min_length": 50.6, "epoch": 0.15555555555555556, "grad_norm": 1.1591488122940063, "kl": 0.036461079679429534, "learning_rate": 3.111111111111111e-06, "loss": 0.04821138978004456, "memory(GiB)": 68.5, "reward": 0.7538141965866089, "reward_std": 0.478214305639267, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4410387217998505, "rewards/ReportKG_Jaccard/mean": 0.17881421372294426, "rewards/ReportKG_Jaccard/std": 0.07170832380652428, "step": 770, "train_speed(iter/s)": 0.038958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.8, "completions/mean_length": 80.775, "completions/min_length": 57.8, "epoch": 0.15656565656565657, "grad_norm": 1.089423418045044, "kl": 0.020473149977624415, "learning_rate": 3.1313131313131314e-06, "loss": -0.038487425446510314, "memory(GiB)": 68.5, "reward": 0.6987765535712243, "reward_std": 0.30876850336790085, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.14877652674913405, "rewards/ReportKG_Jaccard/std": 0.052114526927471164, "step": 775, "train_speed(iter/s)": 0.038922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 76.525, "completions/min_length": 64.4, "epoch": 0.15757575757575756, "grad_norm": 1.162054181098938, "kl": 0.037478874810039996, "learning_rate": 3.1515151515151513e-06, "loss": 0.034347400069236755, "memory(GiB)": 68.5, "reward": 1.0284618616104126, "reward_std": 0.391802553832531, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.22846185714006423, "rewards/ReportKG_Jaccard/std": 0.0595189593732357, "step": 780, "train_speed(iter/s)": 0.038938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 78.875, "completions/min_length": 59.4, "epoch": 0.15858585858585858, "grad_norm": 1.4163861274719238, "kl": 0.02498103529214859, "learning_rate": 3.1717171717171716e-06, "loss": 0.025979048013687132, "memory(GiB)": 68.5, "reward": 0.7279832363128662, "reward_std": 0.4690089046955109, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.45196654200553893, "rewards/ReportKG_Jaccard/mean": 0.17798324823379516, "rewards/ReportKG_Jaccard/std": 0.06191675066947937, "step": 785, "train_speed(iter/s)": 0.038997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.0, "completions/mean_length": 76.85, "completions/min_length": 49.2, "epoch": 0.1595959595959596, "grad_norm": 1.2265125513076782, "kl": 0.04899294972419739, "learning_rate": 3.191919191919192e-06, "loss": 0.08812929391860962, "memory(GiB)": 68.5, "reward": 0.7410695731639863, "reward_std": 0.30306353271007536, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.1910695880651474, "rewards/ReportKG_Jaccard/std": 0.06154468134045601, "step": 790, "train_speed(iter/s)": 0.039019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.0, "completions/mean_length": 64.4, "completions/min_length": 48.6, "epoch": 0.1606060606060606, "grad_norm": 2.0622458457946777, "kl": 0.03304311558604241, "learning_rate": 3.2121212121212117e-06, "loss": 0.019598303735256194, "memory(GiB)": 68.5, "reward": 0.9567746162414551, "reward_std": 0.3075569078326225, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.18177463114261627, "rewards/ReportKG_Jaccard/std": 0.07515710741281509, "step": 795, "train_speed(iter/s)": 0.03896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 70.3, "completions/min_length": 56.4, "epoch": 0.16161616161616163, "grad_norm": 1.4795324802398682, "kl": 0.040943241119384764, "learning_rate": 3.232323232323232e-06, "loss": -0.032660830020904544, "memory(GiB)": 68.5, "reward": 1.0618180751800537, "reward_std": 0.2805647552013397, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.23681810796260833, "rewards/ReportKG_Jaccard/std": 0.07039741724729538, "step": 800, "train_speed(iter/s)": 0.039003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/mean_length": 82.025, "completions/min_length": 63.0, "epoch": 0.16262626262626262, "grad_norm": 1.0055882930755615, "kl": 0.03591778837144375, "learning_rate": 3.2525252525252527e-06, "loss": 0.022162613272666932, "memory(GiB)": 68.5, "reward": 0.9729531764984131, "reward_std": 0.24113671779632567, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.24795317649841309, "rewards/ReportKG_Jaccard/std": 0.06611571609973907, "step": 805, "train_speed(iter/s)": 0.038985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.6, "completions/mean_length": 80.9, "completions/min_length": 58.6, "epoch": 0.16363636363636364, "grad_norm": 1.7402044534683228, "kl": 0.05162110924720764, "learning_rate": 3.272727272727273e-06, "loss": 0.03415307104587555, "memory(GiB)": 68.5, "reward": 0.8971518576145172, "reward_std": 0.3728098660707474, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.17215185165405272, "rewards/ReportKG_Jaccard/std": 0.04611608684062958, "step": 810, "train_speed(iter/s)": 0.038999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 69.825, "completions/min_length": 54.0, "epoch": 0.16464646464646465, "grad_norm": 1.488639235496521, "kl": 0.024755885638296605, "learning_rate": 3.2929292929292928e-06, "loss": 0.07623492479324341, "memory(GiB)": 68.5, "reward": 0.6003436088562012, "reward_std": 0.26998637318611146, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.25034359395503997, "rewards/ReportKG_Jaccard/std": 0.07911410927772522, "step": 815, "train_speed(iter/s)": 0.039045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.4, "completions/mean_length": 65.65, "completions/min_length": 51.4, "epoch": 0.16565656565656567, "grad_norm": 1.098900556564331, "kl": 0.027234542928636075, "learning_rate": 3.313131313131313e-06, "loss": 0.020918408036231996, "memory(GiB)": 68.5, "reward": 0.9402597188949585, "reward_std": 0.4523775279521942, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.2402597337961197, "rewards/ReportKG_Jaccard/std": 0.07202996462583541, "step": 820, "train_speed(iter/s)": 0.039093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 74.6, "completions/min_length": 57.8, "epoch": 0.16666666666666666, "grad_norm": 1.1986645460128784, "kl": 0.03899468127638102, "learning_rate": 3.3333333333333333e-06, "loss": 0.03357445895671844, "memory(GiB)": 68.5, "reward": 1.038748073577881, "reward_std": 0.4714795708656311, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.2887480825185776, "rewards/ReportKG_Jaccard/std": 0.08619363903999329, "step": 825, "train_speed(iter/s)": 0.039095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 73.375, "completions/min_length": 55.8, "epoch": 0.16767676767676767, "grad_norm": 1.502769112586975, "kl": 0.02464079186320305, "learning_rate": 3.353535353535353e-06, "loss": 0.006424277275800705, "memory(GiB)": 68.5, "reward": 0.5784728288650512, "reward_std": 0.4862861156463623, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.2034728318452835, "rewards/ReportKG_Jaccard/std": 0.07439900636672973, "step": 830, "train_speed(iter/s)": 0.039046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 125.6, "completions/mean_length": 80.025, "completions/min_length": 52.8, "epoch": 0.1686868686868687, "grad_norm": 1.511513352394104, "kl": 0.021152522414922714, "learning_rate": 3.3737373737373734e-06, "loss": -0.046140441298484804, "memory(GiB)": 68.5, "reward": 0.46795201003551484, "reward_std": 0.2891600549221039, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.1679520070552826, "rewards/ReportKG_Jaccard/std": 0.0641694724559784, "step": 835, "train_speed(iter/s)": 0.03905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 72.075, "completions/min_length": 58.2, "epoch": 0.1696969696969697, "grad_norm": 1.6890538930892944, "kl": 0.022666182741522788, "learning_rate": 3.3939393939393937e-06, "loss": 0.04218232035636902, "memory(GiB)": 68.5, "reward": 0.7673286318778991, "reward_std": 0.33381136804819106, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.19232860654592515, "rewards/ReportKG_Jaccard/std": 0.06115225665271282, "step": 840, "train_speed(iter/s)": 0.039086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 67.2, "completions/min_length": 49.2, "epoch": 0.1707070707070707, "grad_norm": 1.1553573608398438, "kl": 0.03444953002035618, "learning_rate": 3.4141414141414136e-06, "loss": 0.04460963010787964, "memory(GiB)": 68.5, "reward": 0.7997817635536194, "reward_std": 0.3875963784754276, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.2247817575931549, "rewards/ReportKG_Jaccard/std": 0.05984063744544983, "step": 845, "train_speed(iter/s)": 0.039126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.0, "completions/mean_length": 84.5, "completions/min_length": 64.2, "epoch": 0.1717171717171717, "grad_norm": 1.3158124685287476, "kl": 0.0405407689511776, "learning_rate": 3.4343434343434343e-06, "loss": -0.007041454315185547, "memory(GiB)": 68.5, "reward": 1.0027103304862977, "reward_std": 0.2820084735751152, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.17771033495664595, "rewards/ReportKG_Jaccard/std": 0.059754344075918196, "step": 850, "train_speed(iter/s)": 0.039095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.6, "completions/mean_length": 78.7, "completions/min_length": 52.8, "epoch": 0.17272727272727273, "grad_norm": 1.2308241128921509, "kl": 0.02312624454498291, "learning_rate": 3.4545454545454545e-06, "loss": -0.056117141246795656, "memory(GiB)": 68.5, "reward": 0.7422412008047103, "reward_std": 0.3491463929414749, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.21724119782447815, "rewards/ReportKG_Jaccard/std": 0.07310130670666695, "step": 855, "train_speed(iter/s)": 0.039035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 75.025, "completions/min_length": 57.6, "epoch": 0.17373737373737375, "grad_norm": 1.132370114326477, "kl": 0.01944839023053646, "learning_rate": 3.474747474747475e-06, "loss": 0.01756485104560852, "memory(GiB)": 68.5, "reward": 0.6054902851581574, "reward_std": 0.349552583694458, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.2804902851581573, "rewards/ReportKG_Jaccard/std": 0.06637894734740257, "step": 860, "train_speed(iter/s)": 0.038957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.2, "completions/mean_length": 85.575, "completions/min_length": 59.0, "epoch": 0.17474747474747473, "grad_norm": 0.97343510389328, "kl": 0.05199473612010479, "learning_rate": 3.4949494949494947e-06, "loss": -0.0003444969654083252, "memory(GiB)": 68.5, "reward": 0.783398225903511, "reward_std": 0.19361822679638863, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.18339824974536895, "rewards/ReportKG_Jaccard/std": 0.0860351949930191, "step": 865, "train_speed(iter/s)": 0.038926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 76.2, "completions/min_length": 51.4, "epoch": 0.17575757575757575, "grad_norm": 1.1916035413742065, "kl": 0.026103384792804718, "learning_rate": 3.515151515151515e-06, "loss": -0.036760610342025754, "memory(GiB)": 68.5, "reward": 0.772384238243103, "reward_std": 0.3990656495094299, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2473842293024063, "rewards/ReportKG_Jaccard/std": 0.076168941706419, "step": 870, "train_speed(iter/s)": 0.038848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.4, "completions/mean_length": 81.65, "completions/min_length": 60.0, "epoch": 0.17676767676767677, "grad_norm": 1.288935661315918, "kl": 0.022030723839998247, "learning_rate": 3.5353535353535352e-06, "loss": -0.04023534059524536, "memory(GiB)": 68.5, "reward": 0.5928080826997757, "reward_std": 0.4236585423350334, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.16780808120965957, "rewards/ReportKG_Jaccard/std": 0.0766407735645771, "step": 875, "train_speed(iter/s)": 0.038791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 75.325, "completions/min_length": 58.0, "epoch": 0.17777777777777778, "grad_norm": 1.246311902999878, "kl": 0.023006285727024078, "learning_rate": 3.555555555555555e-06, "loss": -0.0006784072611480951, "memory(GiB)": 68.5, "reward": 0.6360428750514984, "reward_std": 0.4477119088172913, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.42594102025032043, "rewards/ReportKG_Jaccard/mean": 0.21104286313056947, "rewards/ReportKG_Jaccard/std": 0.04795922040939331, "step": 880, "train_speed(iter/s)": 0.038674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.8, "completions/mean_length": 88.825, "completions/min_length": 66.6, "epoch": 0.1787878787878788, "grad_norm": 1.9172621965408325, "kl": 0.023183613643050195, "learning_rate": 3.5757575757575753e-06, "loss": 0.017313672602176665, "memory(GiB)": 68.5, "reward": 0.8555826365947723, "reward_std": 0.3666342467069626, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.30639100074768066, "rewards/ReportKG_Jaccard/mean": 0.30558266341686247, "rewards/ReportKG_Jaccard/std": 0.07224676124751568, "step": 885, "train_speed(iter/s)": 0.038699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.8, "completions/mean_length": 82.15, "completions/min_length": 61.8, "epoch": 0.1797979797979798, "grad_norm": 1.3661775588989258, "kl": 0.04470670223236084, "learning_rate": 3.595959595959596e-06, "loss": 0.00445154495537281, "memory(GiB)": 68.5, "reward": 0.7305402144789696, "reward_std": 0.39532503187656404, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.15554021149873734, "rewards/ReportKG_Jaccard/std": 0.0499887615442276, "step": 890, "train_speed(iter/s)": 0.038709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.2, "completions/mean_length": 66.35, "completions/min_length": 54.6, "epoch": 0.1808080808080808, "grad_norm": 1.2289105653762817, "kl": 0.03420211039483547, "learning_rate": 3.6161616161616163e-06, "loss": 0.003760361671447754, "memory(GiB)": 68.5, "reward": 0.7365811407566071, "reward_std": 0.43108542561531066, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.2615811288356781, "rewards/ReportKG_Jaccard/std": 0.11046685874462128, "step": 895, "train_speed(iter/s)": 0.038727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.4, "completions/mean_length": 73.125, "completions/min_length": 53.4, "epoch": 0.18181818181818182, "grad_norm": 1.4622653722763062, "kl": 0.049149074405431745, "learning_rate": 3.636363636363636e-06, "loss": 0.00777403861284256, "memory(GiB)": 68.5, "reward": 0.9838730335235596, "reward_std": 0.4644128501415253, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.2088730439543724, "rewards/ReportKG_Jaccard/std": 0.08319919630885124, "step": 900, "train_speed(iter/s)": 0.038721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 76.025, "completions/min_length": 56.2, "epoch": 0.18282828282828284, "grad_norm": 2.0312347412109375, "kl": 0.0387994147837162, "learning_rate": 3.6565656565656564e-06, "loss": 0.013680750131607055, "memory(GiB)": 68.5, "reward": 0.7918467581272125, "reward_std": 0.44881420135498046, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.2418467491865158, "rewards/ReportKG_Jaccard/std": 0.08131028264760971, "step": 905, "train_speed(iter/s)": 0.038683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 70.35, "completions/min_length": 54.6, "epoch": 0.18383838383838383, "grad_norm": 1.6597431898117065, "kl": 0.02022332642227411, "learning_rate": 3.6767676767676767e-06, "loss": 0.018839138746261596, "memory(GiB)": 68.5, "reward": 0.46563891768455506, "reward_std": 0.36093697398900987, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.21563892513513566, "rewards/ReportKG_Jaccard/std": 0.05661410093307495, "step": 910, "train_speed(iter/s)": 0.038738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 76.475, "completions/min_length": 55.8, "epoch": 0.18484848484848485, "grad_norm": 1.2903039455413818, "kl": 0.04248388595879078, "learning_rate": 3.6969696969696966e-06, "loss": 0.08921950459480285, "memory(GiB)": 68.5, "reward": 0.5553222715854644, "reward_std": 0.31793116927146914, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.1803222715854645, "rewards/ReportKG_Jaccard/std": 0.08133556805551052, "step": 915, "train_speed(iter/s)": 0.038783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.4, "completions/mean_length": 68.625, "completions/min_length": 58.0, "epoch": 0.18585858585858586, "grad_norm": 1.274975299835205, "kl": 0.07745856046676636, "learning_rate": 3.717171717171717e-06, "loss": 0.04921229481697083, "memory(GiB)": 68.5, "reward": 0.9196185350418091, "reward_std": 0.35160446912050247, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.16961855590343475, "rewards/ReportKG_Jaccard/std": 0.04896462559700012, "step": 920, "train_speed(iter/s)": 0.038849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.6, "completions/mean_length": 78.725, "completions/min_length": 62.8, "epoch": 0.18686868686868688, "grad_norm": 1.275202989578247, "kl": 0.03679501321166754, "learning_rate": 3.737373737373737e-06, "loss": 0.004498618841171265, "memory(GiB)": 68.5, "reward": 1.017687690258026, "reward_std": 0.3059808000922203, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2926876813173294, "rewards/ReportKG_Jaccard/std": 0.07683620527386666, "step": 925, "train_speed(iter/s)": 0.038847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.0, "completions/mean_length": 78.025, "completions/min_length": 62.6, "epoch": 0.18787878787878787, "grad_norm": 1.2806388139724731, "kl": 0.025123779848217963, "learning_rate": 3.757575757575758e-06, "loss": -0.0022324712947010992, "memory(GiB)": 68.5, "reward": 0.7743969440460206, "reward_std": 0.5115850567817688, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4956935167312622, "rewards/ReportKG_Jaccard/mean": 0.19939695447683334, "rewards/ReportKG_Jaccard/std": 0.06263290420174598, "step": 930, "train_speed(iter/s)": 0.038845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 73.4, "completions/min_length": 59.0, "epoch": 0.18888888888888888, "grad_norm": 1.160765528678894, "kl": 0.023153012990951537, "learning_rate": 3.7777777777777777e-06, "loss": 0.020245814323425294, "memory(GiB)": 68.5, "reward": 0.6395729392766952, "reward_std": 0.3817021906375885, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.1895729348063469, "rewards/ReportKG_Jaccard/std": 0.051149120554327965, "step": 935, "train_speed(iter/s)": 0.038738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.0, "completions/mean_length": 81.0, "completions/min_length": 53.0, "epoch": 0.1898989898989899, "grad_norm": 1.5214135646820068, "kl": 0.027527859807014464, "learning_rate": 3.797979797979798e-06, "loss": 0.01222202628850937, "memory(GiB)": 69.34, "reward": 0.6040517807006835, "reward_std": 0.38148421198129656, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.15405178517103196, "rewards/ReportKG_Jaccard/std": 0.06721272319555283, "step": 940, "train_speed(iter/s)": 0.038743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 69.525, "completions/min_length": 53.2, "epoch": 0.19090909090909092, "grad_norm": 1.6439745426177979, "kl": 0.03964070416986942, "learning_rate": 3.818181818181818e-06, "loss": 0.02536895275115967, "memory(GiB)": 69.34, "reward": 0.826170814037323, "reward_std": 0.4520430564880371, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.40650616884231566, "rewards/ReportKG_Jaccard/mean": 0.17617079317569734, "rewards/ReportKG_Jaccard/std": 0.0625217504799366, "step": 945, "train_speed(iter/s)": 0.038729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 76.575, "completions/min_length": 58.6, "epoch": 0.1919191919191919, "grad_norm": 1.6828774213790894, "kl": 0.021515587624162436, "learning_rate": 3.8383838383838385e-06, "loss": -0.02267075181007385, "memory(GiB)": 69.34, "reward": 0.5068772852420806, "reward_std": 0.32459720969200134, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.2068772941827774, "rewards/ReportKG_Jaccard/std": 0.06348284035921097, "step": 950, "train_speed(iter/s)": 0.038735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 70.925, "completions/min_length": 53.6, "epoch": 0.19292929292929292, "grad_norm": 1.124837875366211, "kl": 0.05560213439166546, "learning_rate": 3.858585858585859e-06, "loss": 0.009764151275157928, "memory(GiB)": 69.34, "reward": 0.8117639005184174, "reward_std": 0.2829137146472931, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.21176388263702392, "rewards/ReportKG_Jaccard/std": 0.06599815413355828, "step": 955, "train_speed(iter/s)": 0.038753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/mean_length": 81.025, "completions/min_length": 59.0, "epoch": 0.19393939393939394, "grad_norm": 1.6440997123718262, "kl": 0.03047732785344124, "learning_rate": 3.878787878787879e-06, "loss": -0.0324880838394165, "memory(GiB)": 69.34, "reward": 0.5823818683624268, "reward_std": 0.38056460320949553, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.23238186836242675, "rewards/ReportKG_Jaccard/std": 0.08404957205057144, "step": 960, "train_speed(iter/s)": 0.038775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.2, "completions/mean_length": 65.225, "completions/min_length": 52.4, "epoch": 0.19494949494949496, "grad_norm": 1.3175181150436401, "kl": 0.02200741898268461, "learning_rate": 3.8989898989898984e-06, "loss": -0.008184242248535156, "memory(GiB)": 69.34, "reward": 0.7080448389053344, "reward_std": 0.3188824370503426, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.3330448389053345, "rewards/ReportKG_Jaccard/std": 0.06883075907826423, "step": 965, "train_speed(iter/s)": 0.038823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 67.575, "completions/min_length": 47.0, "epoch": 0.19595959595959597, "grad_norm": 1.669192910194397, "kl": 0.024916011095046996, "learning_rate": 3.919191919191919e-06, "loss": -0.012352780252695084, "memory(GiB)": 69.34, "reward": 0.5035060763359069, "reward_std": 0.3852567583322525, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.22850607037544252, "rewards/ReportKG_Jaccard/std": 0.07854758724570274, "step": 970, "train_speed(iter/s)": 0.03881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 74.2, "completions/min_length": 58.4, "epoch": 0.19696969696969696, "grad_norm": 1.140015721321106, "kl": 0.034292767941951754, "learning_rate": 3.939393939393939e-06, "loss": -0.009510629624128342, "memory(GiB)": 69.34, "reward": 1.0242047309875488, "reward_std": 0.39253018498420716, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.24920476973056793, "rewards/ReportKG_Jaccard/std": 0.0758708618581295, "step": 975, "train_speed(iter/s)": 0.038873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 125.0, "completions/mean_length": 97.175, "completions/min_length": 74.0, "epoch": 0.19797979797979798, "grad_norm": 1.5548375844955444, "kl": 0.03465084135532379, "learning_rate": 3.959595959595959e-06, "loss": -0.009736534208059311, "memory(GiB)": 69.34, "reward": 0.6111080348491669, "reward_std": 0.3788094073534012, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.13610803186893464, "rewards/ReportKG_Jaccard/std": 0.0533867597579956, "step": 980, "train_speed(iter/s)": 0.038784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/mean_length": 87.425, "completions/min_length": 68.2, "epoch": 0.198989898989899, "grad_norm": 1.2997499704360962, "kl": 0.05213622748851776, "learning_rate": 3.9797979797979795e-06, "loss": 0.02234429121017456, "memory(GiB)": 69.34, "reward": 0.46999540328979494, "reward_std": 0.40172292590141295, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.16999541223049164, "rewards/ReportKG_Jaccard/std": 0.0506893515586853, "step": 985, "train_speed(iter/s)": 0.038763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/mean_length": 85.7, "completions/min_length": 72.0, "epoch": 0.2, "grad_norm": 1.5630958080291748, "kl": 0.028531961143016815, "learning_rate": 4e-06, "loss": 0.012811888754367829, "memory(GiB)": 69.34, "reward": 0.8487338960170746, "reward_std": 0.48540291786193845, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.22373391389846803, "rewards/ReportKG_Jaccard/std": 0.06435786485671997, "step": 990, "train_speed(iter/s)": 0.038813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 71.6, "completions/min_length": 53.0, "epoch": 0.201010101010101, "grad_norm": 1.2831370830535889, "kl": 0.02675914131104946, "learning_rate": 3.999984265649774e-06, "loss": 0.06354296803474427, "memory(GiB)": 69.34, "reward": 0.984251719713211, "reward_std": 0.32080049365758895, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.1842517226934433, "rewards/ReportKG_Jaccard/std": 0.06363382525742053, "step": 995, "train_speed(iter/s)": 0.038764 }, { "epoch": 0.20202020202020202, "grad_norm": 1.3129075765609741, "learning_rate": 3.999937062846666e-06, "loss": 0.020880439877510072, "memory(GiB)": 69.34, "step": 1000, "train_speed(iter/s)": 0.038826 }, { "epoch": 0.20202020202020202, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 102.34, "eval_completions/mean_length": 79.3625, "eval_completions/min_length": 61.14, "eval_kl": 0.03246727695688605, "eval_loss": 0.017378615215420723, "eval_reward": 0.7136600729823113, "eval_reward_std": 0.33131871923804285, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.51, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.2871864545345306, "eval_rewards/ReportKG_Jaccard/mean": 0.20366006970405579, "eval_rewards/ReportKG_Jaccard/std": 0.06330945491790771, "eval_runtime": 902.9869, "eval_samples_per_second": 0.055, "eval_steps_per_second": 0.008, "step": 1000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 75.4, "completions/min_length": 55.0, "epoch": 0.20303030303030303, "grad_norm": 1.6278780698776245, "kl": 0.03614179622381926, "learning_rate": 3.9998583923333814e-06, "loss": 0.06172839403152466, "memory(GiB)": 69.34, "reward": 0.8450858473777771, "reward_std": 0.37426090352237223, "rewards/MultiModalAccuracyORM_Any/mean": 0.5875, "rewards/MultiModalAccuracyORM_Any/std": 0.3265775293111801, "rewards/ReportKG_Jaccard/mean": 0.2575858473777771, "rewards/ReportKG_Jaccard/std": 0.07852655351161957, "step": 1005, "train_speed(iter/s)": 0.037414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.0, "completions/mean_length": 67.925, "completions/min_length": 46.0, "epoch": 0.20404040404040405, "grad_norm": 2.0161445140838623, "kl": 0.03262691460549831, "learning_rate": 3.9997482553477505e-06, "loss": -0.03743865191936493, "memory(GiB)": 69.34, "reward": 0.7477865934371948, "reward_std": 0.42075752168893815, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3955783486366272, "rewards/ReportKG_Jaccard/mean": 0.17278658598661423, "rewards/ReportKG_Jaccard/std": 0.057706959545612335, "step": 1010, "train_speed(iter/s)": 0.037412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 74.575, "completions/min_length": 60.8, "epoch": 0.20505050505050504, "grad_norm": 0.9157541394233704, "kl": 0.038740452378988266, "learning_rate": 3.999606653622705e-06, "loss": 0.009682652354240418, "memory(GiB)": 69.34, "reward": 0.957771897315979, "reward_std": 0.42957242131233214, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.20777189880609512, "rewards/ReportKG_Jaccard/std": 0.06048981621861458, "step": 1015, "train_speed(iter/s)": 0.037468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 70.0, "completions/min_length": 52.2, "epoch": 0.20606060606060606, "grad_norm": 1.1444168090820312, "kl": 0.04684444256126881, "learning_rate": 3.999433589386258e-06, "loss": 0.04799893498420715, "memory(GiB)": 69.34, "reward": 1.1299414157867431, "reward_std": 0.3488317713141441, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.27994142174720765, "rewards/ReportKG_Jaccard/std": 0.05751045867800712, "step": 1020, "train_speed(iter/s)": 0.037463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 70.825, "completions/min_length": 51.6, "epoch": 0.20707070707070707, "grad_norm": 1.1115009784698486, "kl": 0.02991071008145809, "learning_rate": 3.9992290653614625e-06, "loss": 0.04760961830615997, "memory(GiB)": 69.34, "reward": 0.8334036529064178, "reward_std": 0.41865582317113875, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3771016776561737, "rewards/ReportKG_Jaccard/mean": 0.20840363204479218, "rewards/ReportKG_Jaccard/std": 0.06402484066784382, "step": 1025, "train_speed(iter/s)": 0.037465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.8, "completions/mean_length": 68.5, "completions/min_length": 54.4, "epoch": 0.2080808080808081, "grad_norm": 1.0234990119934082, "kl": 0.04695476219058037, "learning_rate": 3.99899308476637e-06, "loss": 0.06375235915184022, "memory(GiB)": 69.34, "reward": 0.8133358612656594, "reward_std": 0.14831309616565705, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.23833587318658828, "rewards/ReportKG_Jaccard/std": 0.08574636206030846, "step": 1030, "train_speed(iter/s)": 0.037511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/mean_length": 62.05, "completions/min_length": 46.4, "epoch": 0.20909090909090908, "grad_norm": 2.0179057121276855, "kl": 0.025699202716350556, "learning_rate": 3.998725651313984e-06, "loss": 0.04427437782287598, "memory(GiB)": 69.34, "reward": 0.8190256893634796, "reward_std": 0.3228685587644577, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.244025719165802, "rewards/ReportKG_Jaccard/std": 0.05841932594776154, "step": 1035, "train_speed(iter/s)": 0.037523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 73.275, "completions/min_length": 60.2, "epoch": 0.2101010101010101, "grad_norm": 1.6833901405334473, "kl": 0.023891088739037513, "learning_rate": 3.998426769212193e-06, "loss": 0.024789471924304963, "memory(GiB)": 69.34, "reward": 0.5012933313846588, "reward_std": 0.2866771623492241, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.22629333138465882, "rewards/ReportKG_Jaccard/std": 0.0897998109459877, "step": 1040, "train_speed(iter/s)": 0.037502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 67.525, "completions/min_length": 51.4, "epoch": 0.2111111111111111, "grad_norm": 1.353227972984314, "kl": 0.03913901560008526, "learning_rate": 3.998096443163716e-06, "loss": -0.031100985407829285, "memory(GiB)": 69.34, "reward": 0.6578051149845123, "reward_std": 0.3965238943696022, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.18280513137578963, "rewards/ReportKG_Jaccard/std": 0.05167805776000023, "step": 1045, "train_speed(iter/s)": 0.037464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 75.225, "completions/min_length": 53.8, "epoch": 0.21212121212121213, "grad_norm": 1.7208319902420044, "kl": 0.023162856325507165, "learning_rate": 3.997734678366016e-06, "loss": -0.02069974094629288, "memory(GiB)": 69.34, "reward": 0.7745181918144226, "reward_std": 0.3162837907671928, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.19951818883419037, "rewards/ReportKG_Jaccard/std": 0.058213062956929205, "step": 1050, "train_speed(iter/s)": 0.037425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.8, "completions/mean_length": 82.35, "completions/min_length": 62.8, "epoch": 0.21313131313131314, "grad_norm": 1.5086846351623535, "kl": 0.05377155058085918, "learning_rate": 3.997341480511229e-06, "loss": -0.026097068190574647, "memory(GiB)": 69.34, "reward": 0.6587137401103973, "reward_std": 0.43104385733604433, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.20871374607086182, "rewards/ReportKG_Jaccard/std": 0.04419204592704773, "step": 1055, "train_speed(iter/s)": 0.037389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.4, "completions/mean_length": 62.8, "completions/min_length": 50.2, "epoch": 0.21414141414141413, "grad_norm": 2.3065717220306396, "kl": 0.03138445168733597, "learning_rate": 3.996916855786066e-06, "loss": -0.007674118876457215, "memory(GiB)": 69.34, "reward": 0.7956344783306122, "reward_std": 0.16381949782371522, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.27063448429107667, "rewards/ReportKG_Jaccard/std": 0.08714990466833114, "step": 1060, "train_speed(iter/s)": 0.037429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.8, "completions/mean_length": 85.675, "completions/min_length": 62.0, "epoch": 0.21515151515151515, "grad_norm": 1.3989421129226685, "kl": 0.02568243145942688, "learning_rate": 3.996460810871722e-06, "loss": -0.002877366542816162, "memory(GiB)": 69.34, "reward": 0.6310175687074662, "reward_std": 0.4698172926902771, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.20601755529642105, "rewards/ReportKG_Jaccard/std": 0.0617173433303833, "step": 1065, "train_speed(iter/s)": 0.03742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.6, "completions/mean_length": 82.725, "completions/min_length": 59.0, "epoch": 0.21616161616161617, "grad_norm": 0.80694180727005, "kl": 0.024114641174674035, "learning_rate": 3.995973352943769e-06, "loss": 0.036093896627426146, "memory(GiB)": 69.34, "reward": 0.7296357035636902, "reward_std": 0.3695313483476639, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.22963569164276124, "rewards/ReportKG_Jaccard/std": 0.07426499202847481, "step": 1070, "train_speed(iter/s)": 0.037447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.4, "completions/mean_length": 71.4, "completions/min_length": 55.0, "epoch": 0.21717171717171718, "grad_norm": 1.5232353210449219, "kl": 0.033975695818662645, "learning_rate": 3.995454489672038e-06, "loss": -0.014702796936035156, "memory(GiB)": 69.34, "reward": 0.9565002202987671, "reward_std": 0.4091541051864624, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.20650021433830262, "rewards/ReportKG_Jaccard/std": 0.07746300101280212, "step": 1075, "train_speed(iter/s)": 0.037458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.6, "completions/mean_length": 65.525, "completions/min_length": 54.2, "epoch": 0.21818181818181817, "grad_norm": 1.7107998132705688, "kl": 0.04646092802286148, "learning_rate": 3.994904229220507e-06, "loss": 0.0372814416885376, "memory(GiB)": 69.34, "reward": 0.9101016998291016, "reward_std": 0.41895403861999514, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.21010173559188844, "rewards/ReportKG_Jaccard/std": 0.0834547907114029, "step": 1080, "train_speed(iter/s)": 0.0375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.4, "completions/mean_length": 65.4, "completions/min_length": 51.8, "epoch": 0.2191919191919192, "grad_norm": 1.395262598991394, "kl": 0.025181958824396132, "learning_rate": 3.994322580247166e-06, "loss": 0.055949842929840087, "memory(GiB)": 69.34, "reward": 0.7753119170665741, "reward_std": 0.29885063916444776, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.30031192898750303, "rewards/ReportKG_Jaccard/std": 0.06274867504835129, "step": 1085, "train_speed(iter/s)": 0.037509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 77.0, "completions/mean_length": 60.65, "completions/min_length": 48.8, "epoch": 0.2202020202020202, "grad_norm": 1.4006297588348389, "kl": 0.034170908853411674, "learning_rate": 3.993709551903885e-06, "loss": 0.021196283400058746, "memory(GiB)": 69.34, "reward": 0.9627059102058411, "reward_std": 0.23136057555675507, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.26270588040351867, "rewards/ReportKG_Jaccard/std": 0.07386855259537697, "step": 1090, "train_speed(iter/s)": 0.037523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.4, "completions/mean_length": 70.4, "completions/min_length": 56.2, "epoch": 0.22121212121212122, "grad_norm": 1.4289703369140625, "kl": 0.0279248233884573, "learning_rate": 3.993065153836265e-06, "loss": 0.05192852020263672, "memory(GiB)": 69.34, "reward": 0.9445156693458557, "reward_std": 0.38047201484441756, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.21951568722724915, "rewards/ReportKG_Jaccard/std": 0.06934608295559883, "step": 1095, "train_speed(iter/s)": 0.03753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.6, "completions/mean_length": 70.475, "completions/min_length": 57.2, "epoch": 0.2222222222222222, "grad_norm": 1.1525421142578125, "kl": 0.0465335164219141, "learning_rate": 3.992389396183491e-06, "loss": 0.0337557315826416, "memory(GiB)": 69.34, "reward": 0.7765922784805298, "reward_std": 0.5244037330150604, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.22659227550029754, "rewards/ReportKG_Jaccard/std": 0.07681610845029355, "step": 1100, "train_speed(iter/s)": 0.037589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 75.575, "completions/min_length": 57.2, "epoch": 0.22323232323232323, "grad_norm": 1.400551676750183, "kl": 0.03554810099303722, "learning_rate": 3.991682289578172e-06, "loss": 0.01562044322490692, "memory(GiB)": 69.34, "reward": 0.921914917230606, "reward_std": 0.2812621220946312, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.27191494703292846, "rewards/ReportKG_Jaccard/std": 0.07027920335531235, "step": 1105, "train_speed(iter/s)": 0.037568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 75.55, "completions/min_length": 55.4, "epoch": 0.22424242424242424, "grad_norm": 1.3887560367584229, "kl": 0.03504023589193821, "learning_rate": 3.990943845146169e-06, "loss": -0.005322375893592834, "memory(GiB)": 69.34, "reward": 0.5983000993728638, "reward_std": 0.3205729633569717, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.19830008447170258, "rewards/ReportKG_Jaccard/std": 0.06725248731672764, "step": 1110, "train_speed(iter/s)": 0.037547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 85.3, "completions/min_length": 65.0, "epoch": 0.22525252525252526, "grad_norm": 1.0364408493041992, "kl": 0.049072707444429396, "learning_rate": 3.9901740745064276e-06, "loss": 0.06617981791496277, "memory(GiB)": 69.34, "reward": 0.96074138879776, "reward_std": 0.2910213112831116, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.23574139326810836, "rewards/ReportKG_Jaccard/std": 0.0774584949016571, "step": 1115, "train_speed(iter/s)": 0.037593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 86.075, "completions/min_length": 57.6, "epoch": 0.22626262626262628, "grad_norm": 1.2653803825378418, "kl": 0.039151545986533164, "learning_rate": 3.989372989770786e-06, "loss": 0.036958417296409606, "memory(GiB)": 69.34, "reward": 0.7887044280767441, "reward_std": 0.2868495166301727, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.138704377412796, "rewards/ReportKG_Jaccard/std": 0.09988042563199998, "step": 1120, "train_speed(iter/s)": 0.037541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 79.825, "completions/min_length": 60.6, "epoch": 0.22727272727272727, "grad_norm": 1.5528911352157593, "kl": 0.03634990528225899, "learning_rate": 3.988540603543794e-06, "loss": -0.025877803564071655, "memory(GiB)": 69.34, "reward": 0.6820615291595459, "reward_std": 0.5041311383247375, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.48475000262260437, "rewards/ReportKG_Jaccard/mean": 0.25706155598163605, "rewards/ReportKG_Jaccard/std": 0.08351343050599098, "step": 1125, "train_speed(iter/s)": 0.03759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 73.95, "completions/min_length": 57.2, "epoch": 0.22828282828282828, "grad_norm": 1.2659410238265991, "kl": 0.032391649670898914, "learning_rate": 3.987676928922508e-06, "loss": 0.08481761813163757, "memory(GiB)": 69.34, "reward": 0.9674754023551941, "reward_std": 0.3102507501840591, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.292475401610136, "rewards/ReportKG_Jaccard/std": 0.08148068003356457, "step": 1130, "train_speed(iter/s)": 0.037609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 73.85, "completions/min_length": 53.0, "epoch": 0.2292929292929293, "grad_norm": 1.4675836563110352, "kl": 0.04135896489024162, "learning_rate": 3.986781979496286e-06, "loss": 0.12256299257278443, "memory(GiB)": 69.34, "reward": 0.8143474698066712, "reward_std": 0.4736528337001801, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.23934744596481322, "rewards/ReportKG_Jaccard/std": 0.0668502315878868, "step": 1135, "train_speed(iter/s)": 0.037637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 67.775, "completions/min_length": 50.6, "epoch": 0.23030303030303031, "grad_norm": 1.3106136322021484, "kl": 0.049182210490107535, "learning_rate": 3.9858557693465764e-06, "loss": 0.1008985161781311, "memory(GiB)": 69.34, "reward": 0.9957339525222778, "reward_std": 0.3321593105792999, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.22073397785425186, "rewards/ReportKG_Jaccard/std": 0.08645325936377049, "step": 1140, "train_speed(iter/s)": 0.037673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.2, "completions/mean_length": 61.95, "completions/min_length": 47.6, "epoch": 0.2313131313131313, "grad_norm": 0.6914095878601074, "kl": 0.04737057350575924, "learning_rate": 3.984898313046693e-06, "loss": 0.03188638985157013, "memory(GiB)": 69.34, "reward": 1.0303220748901367, "reward_std": 0.37231272011995314, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.20532208681106567, "rewards/ReportKG_Jaccard/std": 0.07306597977876664, "step": 1145, "train_speed(iter/s)": 0.037718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 65.35, "completions/min_length": 50.4, "epoch": 0.23232323232323232, "grad_norm": 1.4622445106506348, "kl": 0.05559075772762299, "learning_rate": 3.98390962566159e-06, "loss": 0.06677940487861633, "memory(GiB)": 69.34, "reward": 0.7757134467363358, "reward_std": 0.3780220985412598, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.20071343183517457, "rewards/ReportKG_Jaccard/std": 0.05798598900437355, "step": 1150, "train_speed(iter/s)": 0.03767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 67.25, "completions/min_length": 53.4, "epoch": 0.23333333333333334, "grad_norm": 1.5182512998580933, "kl": 0.046110909804701804, "learning_rate": 3.982889722747621e-06, "loss": 0.017089655995368956, "memory(GiB)": 69.34, "reward": 0.851487010717392, "reward_std": 0.21627142280340195, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2264869898557663, "rewards/ReportKG_Jaccard/std": 0.07119922116398811, "step": 1155, "train_speed(iter/s)": 0.037716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 70.975, "completions/min_length": 46.0, "epoch": 0.23434343434343435, "grad_norm": 1.5000725984573364, "kl": 0.03626285046339035, "learning_rate": 3.981838620352293e-06, "loss": 0.07091997265815735, "memory(GiB)": 69.34, "reward": 0.5882613599300385, "reward_std": 0.3594621330499649, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.31392415761947634, "rewards/ReportKG_Jaccard/mean": 0.1382613390684128, "rewards/ReportKG_Jaccard/std": 0.07277992814779281, "step": 1160, "train_speed(iter/s)": 0.037679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 69.275, "completions/min_length": 53.2, "epoch": 0.23535353535353534, "grad_norm": 1.3190475702285767, "kl": 0.02903086058795452, "learning_rate": 3.980756335014023e-06, "loss": 0.08712048530578613, "memory(GiB)": 69.34, "reward": 0.5866510689258575, "reward_std": 0.3599342152476311, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.23665108978748323, "rewards/ReportKG_Jaccard/std": 0.06663270443677902, "step": 1165, "train_speed(iter/s)": 0.037684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.0, "completions/mean_length": 62.3, "completions/min_length": 50.4, "epoch": 0.23636363636363636, "grad_norm": 1.5018070936203003, "kl": 0.05066148713231087, "learning_rate": 3.979642883761865e-06, "loss": 0.04146312475204468, "memory(GiB)": 69.34, "reward": 0.7199047684669495, "reward_std": 0.39784502387046816, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.1949047863483429, "rewards/ReportKG_Jaccard/std": 0.07507876679301262, "step": 1170, "train_speed(iter/s)": 0.037673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 77.725, "completions/min_length": 61.6, "epoch": 0.23737373737373738, "grad_norm": 1.336875557899475, "kl": 0.06331971138715745, "learning_rate": 3.978498284115252e-06, "loss": 0.05069239735603333, "memory(GiB)": 69.34, "reward": 1.2521058082580567, "reward_std": 0.20220561772584916, "rewards/MultiModalAccuracyORM_Any/mean": 0.95, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.3021058201789856, "rewards/ReportKG_Jaccard/std": 0.08810963481664658, "step": 1175, "train_speed(iter/s)": 0.037723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 80.375, "completions/min_length": 58.8, "epoch": 0.2383838383838384, "grad_norm": 1.5907793045043945, "kl": 0.0361514113843441, "learning_rate": 3.977322554083715e-06, "loss": -0.023838087916374207, "memory(GiB)": 69.34, "reward": 0.8425023168325424, "reward_std": 0.32918872088193896, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.19250229597091675, "rewards/ReportKG_Jaccard/std": 0.07626467421650887, "step": 1180, "train_speed(iter/s)": 0.037735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 68.975, "completions/min_length": 52.8, "epoch": 0.23939393939393938, "grad_norm": 1.1144709587097168, "kl": 0.05692397505044937, "learning_rate": 3.976115712166603e-06, "loss": 0.10046510696411133, "memory(GiB)": 69.34, "reward": 0.9719534873962402, "reward_std": 0.5008451700210571, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.4519508481025696, "rewards/ReportKG_Jaccard/mean": 0.2469535067677498, "rewards/ReportKG_Jaccard/std": 0.0862042060121894, "step": 1185, "train_speed(iter/s)": 0.037782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.8, "completions/mean_length": 71.85, "completions/min_length": 53.0, "epoch": 0.2404040404040404, "grad_norm": 1.3727778196334839, "kl": 0.032233943790197374, "learning_rate": 3.974877777352788e-06, "loss": -0.03595666885375977, "memory(GiB)": 69.34, "reward": 0.8707323551177979, "reward_std": 0.49378654956817625, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.2457323580980301, "rewards/ReportKG_Jaccard/std": 0.0789295606315136, "step": 1190, "train_speed(iter/s)": 0.037767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.6, "completions/mean_length": 86.6, "completions/min_length": 57.4, "epoch": 0.24141414141414141, "grad_norm": 1.4396249055862427, "kl": 0.034284995123744014, "learning_rate": 3.973608769120372e-06, "loss": -0.047753292322158816, "memory(GiB)": 69.34, "reward": 0.6003662407398224, "reward_std": 0.5378655254840851, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.4881446659564972, "rewards/ReportKG_Jaccard/mean": 0.2003662422299385, "rewards/ReportKG_Jaccard/std": 0.06770590990781784, "step": 1195, "train_speed(iter/s)": 0.037771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 72.65, "completions/min_length": 53.2, "epoch": 0.24242424242424243, "grad_norm": 1.1574689149856567, "kl": 0.050847319141030314, "learning_rate": 3.972308707436373e-06, "loss": 0.06026275753974915, "memory(GiB)": 69.34, "reward": 0.9124453067779541, "reward_std": 0.35521670877933503, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.1874453067779541, "rewards/ReportKG_Jaccard/std": 0.07434583455324173, "step": 1200, "train_speed(iter/s)": 0.037758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 76.725, "completions/min_length": 60.8, "epoch": 0.24343434343434345, "grad_norm": 1.605225920677185, "kl": 0.049206047877669336, "learning_rate": 3.970977612756418e-06, "loss": 0.06305267810821533, "memory(GiB)": 69.34, "reward": 0.8080949068069458, "reward_std": 0.4801642596721649, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.23309489488601684, "rewards/ReportKG_Jaccard/std": 0.05182976946234703, "step": 1205, "train_speed(iter/s)": 0.037771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 125.6, "completions/mean_length": 95.325, "completions/min_length": 76.8, "epoch": 0.24444444444444444, "grad_norm": 1.0628224611282349, "kl": 0.029568561911582948, "learning_rate": 3.969615506024416e-06, "loss": -0.0398932009935379, "memory(GiB)": 69.34, "reward": 0.9714670896530151, "reward_std": 0.32582165151834486, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.24646710455417634, "rewards/ReportKG_Jaccard/std": 0.06036563515663147, "step": 1210, "train_speed(iter/s)": 0.037749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.2, "completions/mean_length": 67.675, "completions/min_length": 51.2, "epoch": 0.24545454545454545, "grad_norm": 1.7902992963790894, "kl": 0.02967483215034008, "learning_rate": 3.968222408672232e-06, "loss": -5.221925675868988e-05, "memory(GiB)": 69.34, "reward": 0.4548566222190857, "reward_std": 0.1501602813601494, "rewards/MultiModalAccuracyORM_Any/mean": 0.225, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.2298566222190857, "rewards/ReportKG_Jaccard/std": 0.08639039844274521, "step": 1215, "train_speed(iter/s)": 0.037799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 83.825, "completions/min_length": 67.8, "epoch": 0.24646464646464647, "grad_norm": 1.0623750686645508, "kl": 0.03145287148654461, "learning_rate": 3.966798342619348e-06, "loss": 0.006487099826335907, "memory(GiB)": 69.34, "reward": 0.7217339038848877, "reward_std": 0.1459686763584614, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.17173390388488768, "rewards/ReportKG_Jaccard/std": 0.05715916827321053, "step": 1220, "train_speed(iter/s)": 0.037717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.8, "completions/mean_length": 83.425, "completions/min_length": 58.6, "epoch": 0.2474747474747475, "grad_norm": 1.2193565368652344, "kl": 0.02786141522228718, "learning_rate": 3.965343330272516e-06, "loss": -0.013704493641853333, "memory(GiB)": 69.34, "reward": 0.8462760925292969, "reward_std": 0.3719656690955162, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.22127609252929686, "rewards/ReportKG_Jaccard/std": 0.04163195453584194, "step": 1225, "train_speed(iter/s)": 0.037691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 69.275, "completions/min_length": 52.6, "epoch": 0.24848484848484848, "grad_norm": 1.1800673007965088, "kl": 0.056045154482126235, "learning_rate": 3.963857394525413e-06, "loss": 0.10519223213195801, "memory(GiB)": 69.34, "reward": 1.0561432838439941, "reward_std": 0.22254112362861633, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2811432987451553, "rewards/ReportKG_Jaccard/std": 0.07520797252655029, "step": 1230, "train_speed(iter/s)": 0.037714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.2, "completions/mean_length": 68.675, "completions/min_length": 53.2, "epoch": 0.2494949494949495, "grad_norm": 1.1286362409591675, "kl": 0.041346746310591695, "learning_rate": 3.962340558758271e-06, "loss": 0.06621257066726685, "memory(GiB)": 69.34, "reward": 0.9481193482875824, "reward_std": 0.35009093284606935, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.29811935126781464, "rewards/ReportKG_Jaccard/std": 0.12010959610342979, "step": 1235, "train_speed(iter/s)": 0.037735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.4, "completions/mean_length": 75.975, "completions/min_length": 54.4, "epoch": 0.2505050505050505, "grad_norm": 1.5579901933670044, "kl": 0.03695840584114194, "learning_rate": 3.960792846837514e-06, "loss": 0.018234562873840333, "memory(GiB)": 69.34, "reward": 0.8618029594421387, "reward_std": 0.3258839786052704, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.26180297285318377, "rewards/ReportKG_Jaccard/std": 0.08217547088861465, "step": 1240, "train_speed(iter/s)": 0.037729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.6, "completions/mean_length": 82.875, "completions/min_length": 65.8, "epoch": 0.2515151515151515, "grad_norm": 1.136184811592102, "kl": 0.06604565344750882, "learning_rate": 3.959214283115385e-06, "loss": 0.006125139445066452, "memory(GiB)": 69.34, "reward": 0.7858606219291687, "reward_std": 0.45821942687034606, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.21086061000823975, "rewards/ReportKG_Jaccard/std": 0.05662401765584946, "step": 1245, "train_speed(iter/s)": 0.037691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.2, "completions/mean_length": 74.4, "completions/min_length": 53.4, "epoch": 0.25252525252525254, "grad_norm": 1.3189107179641724, "kl": 0.03931228928267956, "learning_rate": 3.957604892429558e-06, "loss": 0.0392501562833786, "memory(GiB)": 69.34, "reward": 1.1748853921890259, "reward_std": 0.16940136328339578, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.24988537579774855, "rewards/ReportKG_Jaccard/std": 0.07726198509335518, "step": 1250, "train_speed(iter/s)": 0.037674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 126.4, "completions/mean_length": 87.4, "completions/min_length": 55.6, "epoch": 0.25353535353535356, "grad_norm": 1.3802508115768433, "kl": 0.02658671997487545, "learning_rate": 3.955964700102749e-06, "loss": -0.017865322530269623, "memory(GiB)": 69.34, "reward": 0.7288520216941834, "reward_std": 0.2720782995223999, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.253852054476738, "rewards/ReportKG_Jaccard/std": 0.07903902679681778, "step": 1255, "train_speed(iter/s)": 0.037666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 82.025, "completions/min_length": 64.8, "epoch": 0.2545454545454545, "grad_norm": 0.5823115706443787, "kl": 0.04559341594576836, "learning_rate": 3.954293731942319e-06, "loss": 0.03993050456047058, "memory(GiB)": 69.34, "reward": 1.000242418050766, "reward_std": 0.3422334149479866, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.2502424120903015, "rewards/ReportKG_Jaccard/std": 0.0886391744017601, "step": 1260, "train_speed(iter/s)": 0.03766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 69.675, "completions/min_length": 50.6, "epoch": 0.25555555555555554, "grad_norm": 1.6092745065689087, "kl": 0.0316575838252902, "learning_rate": 3.952592014239866e-06, "loss": 0.04161718785762787, "memory(GiB)": 69.34, "reward": 1.1132889270782471, "reward_std": 0.32827495634555814, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.26328890323638915, "rewards/ReportKG_Jaccard/std": 0.09580710232257843, "step": 1265, "train_speed(iter/s)": 0.037689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.4, "completions/mean_length": 64.925, "completions/min_length": 54.0, "epoch": 0.25656565656565655, "grad_norm": 1.6359686851501465, "kl": 0.04731528759002686, "learning_rate": 3.950859573770814e-06, "loss": 0.05690310001373291, "memory(GiB)": 69.34, "reward": 0.8754810810089111, "reward_std": 0.3630467116832733, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.22548109591007232, "rewards/ReportKG_Jaccard/std": 0.06794080175459385, "step": 1270, "train_speed(iter/s)": 0.037695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 116.8, "completions/mean_length": 85.775, "completions/min_length": 64.8, "epoch": 0.25757575757575757, "grad_norm": 1.4100050926208496, "kl": 0.0443169042468071, "learning_rate": 3.9490964377939855e-06, "loss": -0.04166666269302368, "memory(GiB)": 69.34, "reward": 0.9949700474739075, "reward_std": 0.41174395084381105, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.19497009366750717, "rewards/ReportKG_Jaccard/std": 0.04236833825707435, "step": 1275, "train_speed(iter/s)": 0.037689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 65.775, "completions/min_length": 53.0, "epoch": 0.2585858585858586, "grad_norm": 1.37874436378479, "kl": 0.04830462671816349, "learning_rate": 3.9473026340511815e-06, "loss": 0.05353777408599854, "memory(GiB)": 69.34, "reward": 0.8799993753433227, "reward_std": 0.34060871601104736, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.32999939620494845, "rewards/ReportKG_Jaccard/std": 0.09664169922471047, "step": 1280, "train_speed(iter/s)": 0.037715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.0, "completions/mean_length": 72.675, "completions/min_length": 60.4, "epoch": 0.2595959595959596, "grad_norm": 1.0936390161514282, "kl": 0.059057324379682544, "learning_rate": 3.945478190766738e-06, "loss": 0.02296128273010254, "memory(GiB)": 69.34, "reward": 0.9345580816268921, "reward_std": 0.36843113899230956, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.23455804884433745, "rewards/ReportKG_Jaccard/std": 0.06946505252271891, "step": 1285, "train_speed(iter/s)": 0.037721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 66.125, "completions/min_length": 45.8, "epoch": 0.2606060606060606, "grad_norm": 1.2273544073104858, "kl": 0.0325771689414978, "learning_rate": 3.943623136647083e-06, "loss": -0.015558063983917236, "memory(GiB)": 69.34, "reward": 0.6399750411510468, "reward_std": 0.27551629692316054, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.2104143261909485, "rewards/ReportKG_Jaccard/mean": 0.21497501134872438, "rewards/ReportKG_Jaccard/std": 0.06916208565235138, "step": 1290, "train_speed(iter/s)": 0.037731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 75.95, "completions/min_length": 56.0, "epoch": 0.26161616161616164, "grad_norm": 1.2504887580871582, "kl": 0.047468240931630136, "learning_rate": 3.94173750088029e-06, "loss": 0.043458253145217896, "memory(GiB)": 69.34, "reward": 0.8990687906742096, "reward_std": 0.43262051343917846, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.14906877726316453, "rewards/ReportKG_Jaccard/std": 0.05354262068867684, "step": 1295, "train_speed(iter/s)": 0.037751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.2, "completions/mean_length": 79.425, "completions/min_length": 55.8, "epoch": 0.26262626262626265, "grad_norm": 1.1749227046966553, "kl": 0.05092538855969906, "learning_rate": 3.93982131313561e-06, "loss": -0.024759522080421446, "memory(GiB)": 69.34, "reward": 0.8619691252708435, "reward_std": 0.47854477167129517, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.46628902554512025, "rewards/ReportKG_Jaccard/mean": 0.16196912974119188, "rewards/ReportKG_Jaccard/std": 0.043878303840756416, "step": 1300, "train_speed(iter/s)": 0.037771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/mean_length": 66.525, "completions/min_length": 53.6, "epoch": 0.2636363636363636, "grad_norm": 2.107431173324585, "kl": 0.03599601686000824, "learning_rate": 3.937874603563015e-06, "loss": 0.01518486589193344, "memory(GiB)": 69.34, "reward": 0.6330452144145966, "reward_std": 0.3889531344175339, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.23304519951343536, "rewards/ReportKG_Jaccard/std": 0.06315385475754738, "step": 1305, "train_speed(iter/s)": 0.037757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.8, "completions/mean_length": 66.575, "completions/min_length": 57.8, "epoch": 0.26464646464646463, "grad_norm": 1.3580293655395508, "kl": 0.052577169984579085, "learning_rate": 3.935897402792712e-06, "loss": 0.014828208088874816, "memory(GiB)": 69.34, "reward": 0.7105203628540039, "reward_std": 0.43450949192047117, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.18552035093307495, "rewards/ReportKG_Jaccard/std": 0.04059676062315702, "step": 1310, "train_speed(iter/s)": 0.037791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 76.95, "completions/min_length": 57.4, "epoch": 0.26565656565656565, "grad_norm": 1.1311978101730347, "kl": 0.07533379420638084, "learning_rate": 3.9338897419346734e-06, "loss": 0.03799906075000763, "memory(GiB)": 69.34, "reward": 0.7452306658029556, "reward_std": 0.33428588658571246, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.1952306866645813, "rewards/ReportKG_Jaccard/std": 0.04377842359244823, "step": 1315, "train_speed(iter/s)": 0.037801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.8, "completions/mean_length": 74.15, "completions/min_length": 50.8, "epoch": 0.26666666666666666, "grad_norm": 1.0453732013702393, "kl": 0.02633534763008356, "learning_rate": 3.931851652578136e-06, "loss": 0.0811096966266632, "memory(GiB)": 69.34, "reward": 0.7322624564170838, "reward_std": 0.4093534529209137, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.23226244896650314, "rewards/ReportKG_Jaccard/std": 0.061149094998836515, "step": 1320, "train_speed(iter/s)": 0.037831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 71.35, "completions/min_length": 54.0, "epoch": 0.2676767676767677, "grad_norm": 1.388681411743164, "kl": 0.05699365846812725, "learning_rate": 3.929783166791114e-06, "loss": 0.0040205646306276325, "memory(GiB)": 69.34, "reward": 0.797004234790802, "reward_std": 0.4572190582752228, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.45196654200553893, "rewards/ReportKG_Jaccard/mean": 0.1970041960477829, "rewards/ReportKG_Jaccard/std": 0.05285092815756798, "step": 1325, "train_speed(iter/s)": 0.037857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.4, "completions/mean_length": 77.2, "completions/min_length": 57.2, "epoch": 0.2686868686868687, "grad_norm": 1.8136413097381592, "kl": 0.04663765542209149, "learning_rate": 3.927684317119884e-06, "loss": -0.018616603314876558, "memory(GiB)": 69.34, "reward": 1.0303948402404786, "reward_std": 0.4038385145366192, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.25539483726024625, "rewards/ReportKG_Jaccard/std": 0.06207528114318848, "step": 1330, "train_speed(iter/s)": 0.037883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.4, "completions/mean_length": 66.975, "completions/min_length": 52.2, "epoch": 0.2696969696969697, "grad_norm": 2.143841028213501, "kl": 0.05566960833966732, "learning_rate": 3.925555136588484e-06, "loss": 0.06755694150924682, "memory(GiB)": 69.34, "reward": 0.6021918952465057, "reward_std": 0.4318364322185516, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.3846505284309387, "rewards/ReportKG_Jaccard/mean": 0.25219190269708636, "rewards/ReportKG_Jaccard/std": 0.08640828281641007, "step": 1335, "train_speed(iter/s)": 0.037849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.6, "completions/mean_length": 90.875, "completions/min_length": 70.4, "epoch": 0.27070707070707073, "grad_norm": 1.1421701908111572, "kl": 0.039573464542627335, "learning_rate": 3.923395658698186e-06, "loss": 0.00975365862250328, "memory(GiB)": 69.34, "reward": 0.6604676604270935, "reward_std": 0.4138111680746078, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.21046767234802247, "rewards/ReportKG_Jaccard/std": 0.07019823007285594, "step": 1340, "train_speed(iter/s)": 0.037867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 68.15, "completions/min_length": 48.6, "epoch": 0.2717171717171717, "grad_norm": 1.3854254484176636, "kl": 0.028586264327168466, "learning_rate": 3.921205917426971e-06, "loss": 0.03973667621612549, "memory(GiB)": 69.34, "reward": 0.9045744419097901, "reward_std": 0.46891980767250063, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.1795744091272354, "rewards/ReportKG_Jaccard/std": 0.06682002805173397, "step": 1345, "train_speed(iter/s)": 0.037842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 70.975, "completions/min_length": 54.6, "epoch": 0.2727272727272727, "grad_norm": 1.215344786643982, "kl": 0.044979045540094374, "learning_rate": 3.9189859472289945e-06, "loss": 0.04732637107372284, "memory(GiB)": 69.34, "reward": 1.0230871081352233, "reward_std": 0.36969054043293, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.22308709919452668, "rewards/ReportKG_Jaccard/std": 0.10637341700494289, "step": 1350, "train_speed(iter/s)": 0.037869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 70.575, "completions/min_length": 49.0, "epoch": 0.2737373737373737, "grad_norm": 1.4362411499023438, "kl": 0.03275910690426827, "learning_rate": 3.9167357830340465e-06, "loss": 0.02726973593235016, "memory(GiB)": 69.34, "reward": 0.6329097867012023, "reward_std": 0.39006741642951964, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.23290978372097015, "rewards/ReportKG_Jaccard/std": 0.06729584597051144, "step": 1355, "train_speed(iter/s)": 0.037863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 72.475, "completions/min_length": 55.6, "epoch": 0.27474747474747474, "grad_norm": 1.4520323276519775, "kl": 0.03609677031636238, "learning_rate": 3.914455460246997e-06, "loss": -0.008902376145124435, "memory(GiB)": 69.34, "reward": 0.7410373389720917, "reward_std": 0.4200188085436821, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.2160373330116272, "rewards/ReportKG_Jaccard/std": 0.05954287722706795, "step": 1360, "train_speed(iter/s)": 0.037855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 69.325, "completions/min_length": 54.2, "epoch": 0.27575757575757576, "grad_norm": 1.060223937034607, "kl": 0.03430369906127453, "learning_rate": 3.912145014747245e-06, "loss": -0.011936287581920623, "memory(GiB)": 69.34, "reward": 0.8566668570041657, "reward_std": 0.4126329779624939, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.2566668391227722, "rewards/ReportKG_Jaccard/std": 0.09183581098914147, "step": 1365, "train_speed(iter/s)": 0.037831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.6, "completions/mean_length": 77.1, "completions/min_length": 58.2, "epoch": 0.2767676767676768, "grad_norm": 1.497517466545105, "kl": 0.02443648613989353, "learning_rate": 3.909804482888147e-06, "loss": 0.020661261677742005, "memory(GiB)": 69.34, "reward": 0.8909147262573243, "reward_std": 0.38054416626691817, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.21591472029685974, "rewards/ReportKG_Jaccard/std": 0.06716175451874733, "step": 1370, "train_speed(iter/s)": 0.037829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/mean_length": 80.825, "completions/min_length": 65.2, "epoch": 0.2777777777777778, "grad_norm": 1.2374606132507324, "kl": 0.03473352920264006, "learning_rate": 3.907433901496454e-06, "loss": -0.022225187718868257, "memory(GiB)": 69.34, "reward": 0.4767670750617981, "reward_std": 0.3611712411046028, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.22676708698272705, "rewards/ReportKG_Jaccard/std": 0.05189434215426445, "step": 1375, "train_speed(iter/s)": 0.037839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 77.6, "completions/mean_length": 62.775, "completions/min_length": 50.8, "epoch": 0.2787878787878788, "grad_norm": 1.7139538526535034, "kl": 0.05606270208954811, "learning_rate": 3.905033307871721e-06, "loss": 0.03893179893493652, "memory(GiB)": 69.34, "reward": 0.8583240747451782, "reward_std": 0.4076854422688484, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3484567105770111, "rewards/ReportKG_Jaccard/mean": 0.18332407921552657, "rewards/ReportKG_Jaccard/std": 0.08014959469437599, "step": 1380, "train_speed(iter/s)": 0.037865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.4, "completions/mean_length": 67.575, "completions/min_length": 54.4, "epoch": 0.2797979797979798, "grad_norm": 1.5061225891113281, "kl": 0.039508603140711786, "learning_rate": 3.90260273978573e-06, "loss": 0.023830023407936097, "memory(GiB)": 69.34, "reward": 0.7234757602214813, "reward_std": 0.4648399353027344, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.22347577810287475, "rewards/ReportKG_Jaccard/std": 0.06832618564367295, "step": 1385, "train_speed(iter/s)": 0.037898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 73.05, "completions/min_length": 55.4, "epoch": 0.2808080808080808, "grad_norm": 1.1254788637161255, "kl": 0.030542326904833317, "learning_rate": 3.900142235481891e-06, "loss": 0.032735639810562135, "memory(GiB)": 69.34, "reward": 0.9511264085769653, "reward_std": 0.44342744946479795, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.25112643241882326, "rewards/ReportKG_Jaccard/std": 0.06895224153995513, "step": 1390, "train_speed(iter/s)": 0.03792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.8, "completions/mean_length": 74.775, "completions/min_length": 54.6, "epoch": 0.2818181818181818, "grad_norm": 1.1380743980407715, "kl": 0.030244965106248856, "learning_rate": 3.897651833674639e-06, "loss": -0.04076708555221557, "memory(GiB)": 69.34, "reward": 0.838195514678955, "reward_std": 0.4373758316040039, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.18819552212953566, "rewards/ReportKG_Jaccard/std": 0.06244275569915771, "step": 1395, "train_speed(iter/s)": 0.037887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 78.2, "completions/min_length": 59.4, "epoch": 0.2828282828282828, "grad_norm": 1.4439488649368286, "kl": 0.03315922170877457, "learning_rate": 3.895131573548829e-06, "loss": -0.01443106085062027, "memory(GiB)": 69.34, "reward": 0.740540188550949, "reward_std": 0.3816703334450722, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.19054018557071686, "rewards/ReportKG_Jaccard/std": 0.08527851030230522, "step": 1400, "train_speed(iter/s)": 0.037902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 70.45, "completions/min_length": 52.0, "epoch": 0.28383838383838383, "grad_norm": 1.7637512683868408, "kl": 0.031282835826277736, "learning_rate": 3.8925814947591175e-06, "loss": -0.0072405852377414705, "memory(GiB)": 69.34, "reward": 0.6137541025876999, "reward_std": 0.2824831821024418, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.1887540876865387, "rewards/ReportKG_Jaccard/std": 0.04908522181212902, "step": 1405, "train_speed(iter/s)": 0.037911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 78.65, "completions/min_length": 62.8, "epoch": 0.28484848484848485, "grad_norm": 1.1173393726348877, "kl": 0.05284435898065567, "learning_rate": 3.890001637429336e-06, "loss": 0.039338475465774535, "memory(GiB)": 69.34, "reward": 1.0552676677703858, "reward_std": 0.3350810259580612, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.18026767373085023, "rewards/ReportKG_Jaccard/std": 0.07282281816005706, "step": 1410, "train_speed(iter/s)": 0.037892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 78.325, "completions/min_length": 65.2, "epoch": 0.28585858585858587, "grad_norm": 1.2668094635009766, "kl": 0.04418725818395615, "learning_rate": 3.8873920421518645e-06, "loss": 0.009610839188098907, "memory(GiB)": 69.34, "reward": 0.7566811144351959, "reward_std": 0.2798940002918243, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2066811203956604, "rewards/ReportKG_Jaccard/std": 0.06722868382930755, "step": 1415, "train_speed(iter/s)": 0.037922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 83.0, "completions/min_length": 63.8, "epoch": 0.2868686868686869, "grad_norm": 1.425176739692688, "kl": 0.03205958902835846, "learning_rate": 3.8847527499869874e-06, "loss": -0.017807257175445557, "memory(GiB)": 69.34, "reward": 0.951313179731369, "reward_std": 0.31137654185295105, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.27631318271160127, "rewards/ReportKG_Jaccard/std": 0.08307917714118958, "step": 1420, "train_speed(iter/s)": 0.037923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/mean_length": 67.8, "completions/min_length": 52.2, "epoch": 0.2878787878787879, "grad_norm": 1.0929889678955078, "kl": 0.037016705796122554, "learning_rate": 3.882083802462254e-06, "loss": 0.04413489699363708, "memory(GiB)": 69.34, "reward": 0.7431703209877014, "reward_std": 0.4955131411552429, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.2681703269481659, "rewards/ReportKG_Jaccard/std": 0.06952281445264816, "step": 1425, "train_speed(iter/s)": 0.037918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 74.275, "completions/min_length": 56.2, "epoch": 0.28888888888888886, "grad_norm": 0.8582306504249573, "kl": 0.04996984228491783, "learning_rate": 3.8793852415718165e-06, "loss": 0.09367237091064454, "memory(GiB)": 69.34, "reward": 1.0301637291908263, "reward_std": 0.2606611609458923, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2551637515425682, "rewards/ReportKG_Jaccard/std": 0.06867481842637062, "step": 1430, "train_speed(iter/s)": 0.03794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 81.6, "completions/min_length": 57.4, "epoch": 0.2898989898989899, "grad_norm": 1.6704673767089844, "kl": 0.030087661184370517, "learning_rate": 3.8766571097757795e-06, "loss": -0.02281261533498764, "memory(GiB)": 69.34, "reward": 0.7383901476860046, "reward_std": 0.3798720747232437, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.21339015960693358, "rewards/ReportKG_Jaccard/std": 0.09376875311136246, "step": 1435, "train_speed(iter/s)": 0.037948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 70.0, "completions/min_length": 54.0, "epoch": 0.2909090909090909, "grad_norm": 1.2085613012313843, "kl": 0.041638245433568956, "learning_rate": 3.873899449999524e-06, "loss": 0.024216191470623018, "memory(GiB)": 69.34, "reward": 0.5974529176950455, "reward_std": 0.28022863864898684, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.2104143261909485, "rewards/ReportKG_Jaccard/mean": 0.22245292961597443, "rewards/ReportKG_Jaccard/std": 0.07931898832321167, "step": 1440, "train_speed(iter/s)": 0.037951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.6, "completions/mean_length": 69.725, "completions/min_length": 55.2, "epoch": 0.2919191919191919, "grad_norm": 1.287447214126587, "kl": 0.04458688572049141, "learning_rate": 3.871112305633033e-06, "loss": 0.053639447689056395, "memory(GiB)": 69.34, "reward": 1.1568350434303283, "reward_std": 0.30322697162628176, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.30683502852916716, "rewards/ReportKG_Jaccard/std": 0.06724576652050018, "step": 1445, "train_speed(iter/s)": 0.037981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 68.2, "completions/min_length": 47.0, "epoch": 0.29292929292929293, "grad_norm": 1.2354564666748047, "kl": 0.03354771919548512, "learning_rate": 3.8682957205302134e-06, "loss": -0.0571011483669281, "memory(GiB)": 69.34, "reward": 0.5919663608074188, "reward_std": 0.40120274722576144, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.16696636080741883, "rewards/ReportKG_Jaccard/std": 0.06399871855974197, "step": 1450, "train_speed(iter/s)": 0.038007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.6, "completions/mean_length": 75.65, "completions/min_length": 57.2, "epoch": 0.29393939393939394, "grad_norm": 1.2275055646896362, "kl": 0.045228271186351775, "learning_rate": 3.865449739008201e-06, "loss": -0.028060990571975707, "memory(GiB)": 69.34, "reward": 0.5368335336446762, "reward_std": 0.33993841856718066, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.18683353662490845, "rewards/ReportKG_Jaccard/std": 0.0647907830774784, "step": 1455, "train_speed(iter/s)": 0.038024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 77.525, "completions/min_length": 58.0, "epoch": 0.29494949494949496, "grad_norm": 1.1178573369979858, "kl": 0.03812740705907345, "learning_rate": 3.862574405846667e-06, "loss": 0.07593544125556946, "memory(GiB)": 69.34, "reward": 0.7681787133216857, "reward_std": 0.4595360577106476, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.14317871630191803, "rewards/ReportKG_Jaccard/std": 0.05535076893866062, "step": 1460, "train_speed(iter/s)": 0.038048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 65.075, "completions/min_length": 50.4, "epoch": 0.295959595959596, "grad_norm": 1.1562563180923462, "kl": 0.04950828105211258, "learning_rate": 3.85966976628711e-06, "loss": 0.03684889078140259, "memory(GiB)": 69.34, "reward": 0.770223754644394, "reward_std": 0.3197189927101135, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.2202237591147423, "rewards/ReportKG_Jaccard/std": 0.08014747388660907, "step": 1465, "train_speed(iter/s)": 0.038084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 77.8, "completions/mean_length": 63.6, "completions/min_length": 45.6, "epoch": 0.296969696969697, "grad_norm": 1.3836541175842285, "kl": 0.0608673632144928, "learning_rate": 3.856735866032145e-06, "loss": 0.0718075156211853, "memory(GiB)": 69.34, "reward": 1.0450618386268615, "reward_std": 0.22007388696074487, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.22006182968616486, "rewards/ReportKG_Jaccard/std": 0.06884529255330563, "step": 1470, "train_speed(iter/s)": 0.038102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.6, "completions/mean_length": 72.2, "completions/min_length": 52.4, "epoch": 0.29797979797979796, "grad_norm": 1.17843759059906, "kl": 0.03265468142926693, "learning_rate": 3.853772751244788e-06, "loss": 0.015311795473098754, "memory(GiB)": 69.34, "reward": 0.6923770993947983, "reward_std": 0.3369146555662155, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.19237709939479827, "rewards/ReportKG_Jaccard/std": 0.11202971674501896, "step": 1475, "train_speed(iter/s)": 0.038104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 65.5, "completions/min_length": 44.2, "epoch": 0.298989898989899, "grad_norm": 1.4530915021896362, "kl": 0.04846903122961521, "learning_rate": 3.8507804685477215e-06, "loss": 0.08714322447776794, "memory(GiB)": 69.34, "reward": 0.9521992266178131, "reward_std": 0.1909773752093315, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.15219923555850984, "rewards/ReportKG_Jaccard/std": 0.09623494036495686, "step": 1480, "train_speed(iter/s)": 0.038098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 70.825, "completions/min_length": 56.2, "epoch": 0.3, "grad_norm": 1.2533947229385376, "kl": 0.026285167410969736, "learning_rate": 3.847759065022573e-06, "loss": 0.020065876841545104, "memory(GiB)": 69.34, "reward": 0.516817569732666, "reward_std": 0.14422842040657996, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.2668175607919693, "rewards/ReportKG_Jaccard/std": 0.07193915396928788, "step": 1485, "train_speed(iter/s)": 0.038115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 70.175, "completions/min_length": 50.8, "epoch": 0.301010101010101, "grad_norm": 1.1855398416519165, "kl": 0.05299968086183071, "learning_rate": 3.844708588209163e-06, "loss": 0.016307526826858522, "memory(GiB)": 69.34, "reward": 0.8540173172950745, "reward_std": 0.476984429359436, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.17901733815670012, "rewards/ReportKG_Jaccard/std": 0.044379997625947, "step": 1490, "train_speed(iter/s)": 0.038136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 69.85, "completions/min_length": 49.4, "epoch": 0.302020202020202, "grad_norm": 1.9264878034591675, "kl": 0.03979369215667248, "learning_rate": 3.841629086104761e-06, "loss": 0.060290420055389406, "memory(GiB)": 69.34, "reward": 0.9128664612770081, "reward_std": 0.3575418546795845, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.21286645531654358, "rewards/ReportKG_Jaccard/std": 0.07652179263532162, "step": 1495, "train_speed(iter/s)": 0.038128 }, { "epoch": 0.30303030303030304, "grad_norm": 1.482066035270691, "learning_rate": 3.838520607163331e-06, "loss": -0.0037732701748609543, "memory(GiB)": 69.34, "step": 1500, "train_speed(iter/s)": 0.038127 }, { "epoch": 0.30303030303030304, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 99.86, "eval_completions/mean_length": 76.265, "eval_completions/min_length": 54.66, "eval_kl": 0.04142370734363794, "eval_loss": -0.006677841767668724, "eval_reward": 0.7517347121238709, "eval_reward_std": 0.2634794845804572, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.5425, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.2241564053297043, "eval_rewards/ReportKG_Jaccard/mean": 0.20923471676185726, "eval_rewards/ReportKG_Jaccard/std": 0.06625828260555863, "eval_runtime": 893.8377, "eval_samples_per_second": 0.056, "eval_steps_per_second": 0.008, "step": 1500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.5, "completions/mean_length": 73.0, "completions/min_length": 51.5, "epoch": 0.30404040404040406, "grad_norm": 1.9229750633239746, "kl": 0.03787486758083105, "learning_rate": 3.835383200294771e-06, "loss": 0.09076445698738098, "memory(GiB)": 69.34, "reward": 0.8201567500829696, "reward_std": 0.30569571182131766, "rewards/MultiModalAccuracyORM_Any/mean": 0.6125, "rewards/MultiModalAccuracyORM_Any/std": 0.261338609457016, "rewards/ReportKG_Jaccard/mean": 0.20765672624111176, "rewards/ReportKG_Jaccard/std": 0.06356591880321502, "step": 1505, "train_speed(iter/s)": 0.037219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 69.4, "completions/min_length": 52.4, "epoch": 0.30505050505050507, "grad_norm": 1.18679940700531, "kl": 0.0706282064318657, "learning_rate": 3.8322169148641384e-06, "loss": 0.017797188460826875, "memory(GiB)": 69.34, "reward": 0.8276043027639389, "reward_std": 0.41005704998970033, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4116185367107391, "rewards/ReportKG_Jaccard/mean": 0.20260431617498398, "rewards/ReportKG_Jaccard/std": 0.05180828981101513, "step": 1510, "train_speed(iter/s)": 0.037255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.8, "completions/mean_length": 74.1, "completions/min_length": 58.0, "epoch": 0.30606060606060603, "grad_norm": 1.0896434783935547, "kl": 0.03906341344118118, "learning_rate": 3.829021800690879e-06, "loss": -0.04430626332759857, "memory(GiB)": 69.34, "reward": 0.7608364209532738, "reward_std": 0.2862086519598961, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.23583642095327378, "rewards/ReportKG_Jaccard/std": 0.061579905450344086, "step": 1515, "train_speed(iter/s)": 0.037265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.8, "completions/mean_length": 73.675, "completions/min_length": 53.6, "epoch": 0.30707070707070705, "grad_norm": 1.4366973638534546, "kl": 0.04809797629714012, "learning_rate": 3.8257979080480356e-06, "loss": 0.028177452087402344, "memory(GiB)": 69.34, "reward": 0.9593172132968902, "reward_std": 0.41970242112874984, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.23431723415851594, "rewards/ReportKG_Jaccard/std": 0.07980479635298252, "step": 1520, "train_speed(iter/s)": 0.037292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 78.675, "completions/min_length": 64.6, "epoch": 0.30808080808080807, "grad_norm": 1.5310481786727905, "kl": 0.04501180686056614, "learning_rate": 3.822545287661465e-06, "loss": 0.07843470573425293, "memory(GiB)": 69.34, "reward": 0.9680610597133636, "reward_std": 0.38897920697927474, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2930610403418541, "rewards/ReportKG_Jaccard/std": 0.0716118760406971, "step": 1525, "train_speed(iter/s)": 0.037322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 74.875, "completions/min_length": 60.4, "epoch": 0.3090909090909091, "grad_norm": 1.5064328908920288, "kl": 0.04357686191797257, "learning_rate": 3.819263990709037e-06, "loss": 0.05163615942001343, "memory(GiB)": 69.34, "reward": 1.082338523864746, "reward_std": 0.3215319126844406, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.33233852982521056, "rewards/ReportKG_Jaccard/std": 0.07537182420492172, "step": 1530, "train_speed(iter/s)": 0.03734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 82.45, "completions/min_length": 58.4, "epoch": 0.3101010101010101, "grad_norm": 1.8224146366119385, "kl": 0.03342209756374359, "learning_rate": 3.815954068819825e-06, "loss": 0.0685445249080658, "memory(GiB)": 69.34, "reward": 0.8276676058769226, "reward_std": 0.4244075268507004, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.1776676207780838, "rewards/ReportKG_Jaccard/std": 0.05712215602397919, "step": 1535, "train_speed(iter/s)": 0.037347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 75.0, "completions/mean_length": 58.2, "completions/min_length": 48.4, "epoch": 0.3111111111111111, "grad_norm": 1.5717939138412476, "kl": 0.039313584193587305, "learning_rate": 3.8126155740732998e-06, "loss": 0.08169685006141662, "memory(GiB)": 69.34, "reward": 1.1988776683807374, "reward_std": 0.12845873683691025, "rewards/MultiModalAccuracyORM_Any/mean": 0.975, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.2238776758313179, "rewards/ReportKG_Jaccard/std": 0.06342656314373016, "step": 1540, "train_speed(iter/s)": 0.037355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 121.0, "completions/mean_length": 91.05, "completions/min_length": 64.2, "epoch": 0.31212121212121213, "grad_norm": 1.1889891624450684, "kl": 0.047249557077884675, "learning_rate": 3.8092485589985075e-06, "loss": -0.02311127632856369, "memory(GiB)": 69.34, "reward": 0.8716071605682373, "reward_std": 0.46244327425956727, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.4587401747703552, "rewards/ReportKG_Jaccard/mean": 0.19660715609788895, "rewards/ReportKG_Jaccard/std": 0.04962544813752175, "step": 1545, "train_speed(iter/s)": 0.037344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.2, "completions/mean_length": 80.85, "completions/min_length": 62.0, "epoch": 0.31313131313131315, "grad_norm": 1.5708264112472534, "kl": 0.03386615924537182, "learning_rate": 3.8058530765732426e-06, "loss": 0.025274685025215148, "memory(GiB)": 69.34, "reward": 0.7056200385093689, "reward_std": 0.40676812678575514, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.18062001168727876, "rewards/ReportKG_Jaccard/std": 0.065301950648427, "step": 1550, "train_speed(iter/s)": 0.037359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 75.575, "completions/min_length": 60.2, "epoch": 0.31414141414141417, "grad_norm": 1.325172781944275, "kl": 0.028365235775709152, "learning_rate": 3.802429180223214e-06, "loss": 0.026198071241378785, "memory(GiB)": 69.34, "reward": 0.6704662889242172, "reward_std": 0.44351791888475417, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3921836853027344, "rewards/ReportKG_Jaccard/mean": 0.22046628445386887, "rewards/ReportKG_Jaccard/std": 0.09158473014831543, "step": 1555, "train_speed(iter/s)": 0.037368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 72.975, "completions/min_length": 55.8, "epoch": 0.3151515151515151, "grad_norm": 1.183466911315918, "kl": 0.03827892132103443, "learning_rate": 3.7989769238212063e-06, "loss": 0.033207368850708005, "memory(GiB)": 69.34, "reward": 0.559584254026413, "reward_std": 0.48958690762519835, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.2845842480659485, "rewards/ReportKG_Jaccard/std": 0.0946392334997654, "step": 1560, "train_speed(iter/s)": 0.037349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 67.525, "completions/min_length": 51.2, "epoch": 0.31616161616161614, "grad_norm": 1.86776602268219, "kl": 0.051183593645691874, "learning_rate": 3.795496361686231e-06, "loss": 0.05487655997276306, "memory(GiB)": 69.34, "reward": 0.9914536416530609, "reward_std": 0.21358002498745918, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2664536565542221, "rewards/ReportKG_Jaccard/std": 0.08073535487055779, "step": 1565, "train_speed(iter/s)": 0.037317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.4, "completions/mean_length": 74.375, "completions/min_length": 57.0, "epoch": 0.31717171717171716, "grad_norm": 1.6609241962432861, "kl": 0.03319739662110806, "learning_rate": 3.7919875485826714e-06, "loss": 0.04074929654598236, "memory(GiB)": 69.34, "reward": 0.8928148865699768, "reward_std": 0.3854381799697876, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3863525390625, "rewards/ReportKG_Jaccard/mean": 0.21781489849090577, "rewards/ReportKG_Jaccard/std": 0.044328777492046355, "step": 1570, "train_speed(iter/s)": 0.037337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 76.325, "completions/min_length": 65.4, "epoch": 0.3181818181818182, "grad_norm": 1.1607195138931274, "kl": 0.03617999143898487, "learning_rate": 3.7884505397194224e-06, "loss": 0.036567994952201845, "memory(GiB)": 69.34, "reward": 1.0109285533428192, "reward_std": 0.3090016320347786, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.26092857122421265, "rewards/ReportKG_Jaccard/std": 0.07025494948029518, "step": 1575, "train_speed(iter/s)": 0.03736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 68.5, "completions/min_length": 51.2, "epoch": 0.3191919191919192, "grad_norm": 0.8166800737380981, "kl": 0.03130967020988464, "learning_rate": 3.7848853907490195e-06, "loss": -0.02739650011062622, "memory(GiB)": 69.34, "reward": 0.6316032469272613, "reward_std": 0.2975702002644539, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.15660325884819032, "rewards/ReportKG_Jaccard/std": 0.052585800737142564, "step": 1580, "train_speed(iter/s)": 0.037341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/mean_length": 73.05, "completions/min_length": 55.0, "epoch": 0.3202020202020202, "grad_norm": 1.3380573987960815, "kl": 0.03023604564368725, "learning_rate": 3.7812921577667655e-06, "loss": -0.025136882066726686, "memory(GiB)": 69.34, "reward": 0.7817078769207001, "reward_std": 0.38437521904706956, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.2317078649997711, "rewards/ReportKG_Jaccard/std": 0.05529894828796387, "step": 1585, "train_speed(iter/s)": 0.037362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 67.9, "completions/min_length": 51.0, "epoch": 0.3212121212121212, "grad_norm": 1.4673843383789062, "kl": 0.04091036729514599, "learning_rate": 3.777670897309847e-06, "loss": -0.0123284213244915, "memory(GiB)": 69.34, "reward": 0.5894495278596878, "reward_std": 0.3371498614549637, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.2845196664333344, "rewards/ReportKG_Jaccard/mean": 0.16444953680038452, "rewards/ReportKG_Jaccard/std": 0.0726212952286005, "step": 1590, "train_speed(iter/s)": 0.037357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 71.725, "completions/min_length": 52.0, "epoch": 0.32222222222222224, "grad_norm": 1.1123497486114502, "kl": 0.04303374923765659, "learning_rate": 3.774021666356443e-06, "loss": 0.06394056677818298, "memory(GiB)": 69.34, "reward": 0.9609409213066101, "reward_std": 0.3627630889415741, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.21094092726707458, "rewards/ReportKG_Jaccard/std": 0.07813438661396503, "step": 1595, "train_speed(iter/s)": 0.03736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 72.55, "completions/min_length": 53.0, "epoch": 0.32323232323232326, "grad_norm": 1.2518141269683838, "kl": 0.03837311416864395, "learning_rate": 3.7703445223248325e-06, "loss": 0.12613728046417236, "memory(GiB)": 69.34, "reward": 0.8597625762224197, "reward_std": 0.3182139217853546, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.28476257622241974, "rewards/ReportKG_Jaccard/std": 0.07598069719970227, "step": 1600, "train_speed(iter/s)": 0.037376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.8, "completions/mean_length": 81.4, "completions/min_length": 62.8, "epoch": 0.3242424242424242, "grad_norm": 1.2815499305725098, "kl": 0.03230362720787525, "learning_rate": 3.7666395230724878e-06, "loss": 0.007893875241279602, "memory(GiB)": 69.34, "reward": 0.8706652283668518, "reward_std": 0.45855966210365295, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.1956652283668518, "rewards/ReportKG_Jaccard/std": 0.06293774396181107, "step": 1605, "train_speed(iter/s)": 0.037362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.6, "completions/mean_length": 63.775, "completions/min_length": 47.6, "epoch": 0.32525252525252524, "grad_norm": 1.813722014427185, "kl": 0.050717313587665555, "learning_rate": 3.762906726895164e-06, "loss": 0.09392762184143066, "memory(GiB)": 69.34, "reward": 0.7986274003982544, "reward_std": 0.3196102365851402, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.1736274003982544, "rewards/ReportKG_Jaccard/std": 0.0668108694255352, "step": 1610, "train_speed(iter/s)": 0.037399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 70.95, "completions/min_length": 49.0, "epoch": 0.32626262626262625, "grad_norm": 1.2808105945587158, "kl": 0.03890492543578148, "learning_rate": 3.7591461925259833e-06, "loss": 0.05560450553894043, "memory(GiB)": 69.34, "reward": 0.5464693814516067, "reward_std": 0.39526058584451673, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.19646938443183898, "rewards/ReportKG_Jaccard/std": 0.08203914165496826, "step": 1615, "train_speed(iter/s)": 0.037422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 66.575, "completions/min_length": 48.2, "epoch": 0.32727272727272727, "grad_norm": 1.7410637140274048, "kl": 0.03014047220349312, "learning_rate": 3.755357979134511e-06, "loss": 0.005027064681053161, "memory(GiB)": 69.34, "reward": 0.7985595643520356, "reward_std": 0.46112933158874514, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.2485595315694809, "rewards/ReportKG_Jaccard/std": 0.0971069049090147, "step": 1620, "train_speed(iter/s)": 0.03744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 71.15, "completions/min_length": 52.0, "epoch": 0.3282828282828283, "grad_norm": 1.3484776020050049, "kl": 0.030643395707011224, "learning_rate": 3.751542146325823e-06, "loss": 0.07132894992828369, "memory(GiB)": 69.34, "reward": 0.9562482953071594, "reward_std": 0.38918532282114027, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2312483087182045, "rewards/ReportKG_Jaccard/std": 0.08239089697599411, "step": 1625, "train_speed(iter/s)": 0.037425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 66.875, "completions/min_length": 53.2, "epoch": 0.3292929292929293, "grad_norm": 0.9545879364013672, "kl": 0.035918372124433516, "learning_rate": 3.74769875413957e-06, "loss": -0.024291862547397614, "memory(GiB)": 69.34, "reward": 0.8929901719093323, "reward_std": 0.32817584201693534, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.24299019426107407, "rewards/ReportKG_Jaccard/std": 0.08409973680973053, "step": 1630, "train_speed(iter/s)": 0.037461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 76.5, "completions/min_length": 57.2, "epoch": 0.3303030303030303, "grad_norm": 1.4883517026901245, "kl": 0.0459111675620079, "learning_rate": 3.743827863049029e-06, "loss": 0.05112986564636231, "memory(GiB)": 69.34, "reward": 0.41179706156253815, "reward_std": 0.20526546463370324, "rewards/MultiModalAccuracyORM_Any/mean": 0.2, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.21179708540439607, "rewards/ReportKG_Jaccard/std": 0.07928667813539506, "step": 1635, "train_speed(iter/s)": 0.037419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 77.25, "completions/min_length": 57.0, "epoch": 0.33131313131313134, "grad_norm": 1.161195158958435, "kl": 0.04504235461354256, "learning_rate": 3.739929533960158e-06, "loss": 0.01668713390827179, "memory(GiB)": 69.34, "reward": 0.8829541802406311, "reward_std": 0.2794369161128998, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.2329541712999344, "rewards/ReportKG_Jaccard/std": 0.09155429378151894, "step": 1640, "train_speed(iter/s)": 0.037396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 70.725, "completions/min_length": 54.0, "epoch": 0.3323232323232323, "grad_norm": 2.0616955757141113, "kl": 0.03163882754743099, "learning_rate": 3.7360038282106306e-06, "loss": 0.04136488437652588, "memory(GiB)": 69.34, "reward": 0.7070938944816589, "reward_std": 0.38157473504543304, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.13209389746189118, "rewards/ReportKG_Jaccard/std": 0.05861420668661595, "step": 1645, "train_speed(iter/s)": 0.037407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 74.025, "completions/min_length": 57.0, "epoch": 0.3333333333333333, "grad_norm": 1.1284866333007812, "kl": 0.03008382357656956, "learning_rate": 3.7320508075688773e-06, "loss": 0.10025790929794312, "memory(GiB)": 69.34, "reward": 1.0930436372756958, "reward_std": 0.41544715240597724, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.29304363429546354, "rewards/ReportKG_Jaccard/std": 0.07095232103019952, "step": 1650, "train_speed(iter/s)": 0.03743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 76.2, "completions/min_length": 56.0, "epoch": 0.33434343434343433, "grad_norm": 1.5210316181182861, "kl": 0.026165328174829482, "learning_rate": 3.728070534233108e-06, "loss": -0.041692861914634706, "memory(GiB)": 69.34, "reward": 0.7443605124950409, "reward_std": 0.42613563537597654, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.24436051547527313, "rewards/ReportKG_Jaccard/std": 0.07135704159736633, "step": 1655, "train_speed(iter/s)": 0.037407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 68.15, "completions/min_length": 50.0, "epoch": 0.33535353535353535, "grad_norm": 1.3370482921600342, "kl": 0.048917827755212785, "learning_rate": 3.7240630708303385e-06, "loss": 0.0418353408575058, "memory(GiB)": 69.34, "reward": 0.9669307112693787, "reward_std": 0.3061853587627411, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.2169307142496109, "rewards/ReportKG_Jaccard/std": 0.11218863651156426, "step": 1660, "train_speed(iter/s)": 0.037445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.2, "completions/mean_length": 66.2, "completions/min_length": 49.4, "epoch": 0.33636363636363636, "grad_norm": 1.351498007774353, "kl": 0.048984797671437263, "learning_rate": 3.7200284804154006e-06, "loss": 0.048977726697921754, "memory(GiB)": 69.34, "reward": 0.5843734443187714, "reward_std": 0.32308867275714875, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.1843734323978424, "rewards/ReportKG_Jaccard/std": 0.0900352880358696, "step": 1665, "train_speed(iter/s)": 0.03744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.2, "completions/mean_length": 77.925, "completions/min_length": 51.4, "epoch": 0.3373737373737374, "grad_norm": 1.2935847043991089, "kl": 0.06426319442689418, "learning_rate": 3.715966826469954e-06, "loss": -0.05803241729736328, "memory(GiB)": 69.34, "reward": 0.8262804269790649, "reward_std": 0.30612880140542986, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.12628042995929717, "rewards/ReportKG_Jaccard/std": 0.06774854809045791, "step": 1670, "train_speed(iter/s)": 0.037424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 70.45, "completions/min_length": 53.6, "epoch": 0.3383838383838384, "grad_norm": 1.2818750143051147, "kl": 0.053863152861595154, "learning_rate": 3.711878172901483e-06, "loss": 0.029649150371551514, "memory(GiB)": 69.34, "reward": 0.8304353713989258, "reward_std": 0.356648151576519, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.2845196664333344, "rewards/ReportKG_Jaccard/mean": 0.2554353401064873, "rewards/ReportKG_Jaccard/std": 0.08726863712072372, "step": 1675, "train_speed(iter/s)": 0.037434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.4, "completions/mean_length": 67.875, "completions/min_length": 56.8, "epoch": 0.3393939393939394, "grad_norm": 1.799034595489502, "kl": 0.05410161912441254, "learning_rate": 3.707762584042296e-06, "loss": 0.04065687954425812, "memory(GiB)": 69.34, "reward": 1.091972541809082, "reward_std": 0.3592404291033745, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.24197252690792084, "rewards/ReportKG_Jaccard/std": 0.06539239585399628, "step": 1680, "train_speed(iter/s)": 0.037455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 71.975, "completions/min_length": 54.6, "epoch": 0.34040404040404043, "grad_norm": 1.4518096446990967, "kl": 0.027075917273759843, "learning_rate": 3.703620124648509e-06, "loss": 0.024485501646995544, "memory(GiB)": 69.34, "reward": 0.6809945642948151, "reward_std": 0.2551219016313553, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.23099454045295714, "rewards/ReportKG_Jaccard/std": 0.08777046836912632, "step": 1685, "train_speed(iter/s)": 0.037435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.0, "completions/mean_length": 89.5, "completions/min_length": 67.8, "epoch": 0.3414141414141414, "grad_norm": 1.0735009908676147, "kl": 0.047069627419114114, "learning_rate": 3.699450859899029e-06, "loss": -0.037794163823127745, "memory(GiB)": 69.34, "reward": 0.9359557390213012, "reward_std": 0.4071000710129738, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.38463483452796937, "rewards/ReportKG_Jaccard/mean": 0.21095572113990785, "rewards/ReportKG_Jaccard/std": 0.056156881526112555, "step": 1690, "train_speed(iter/s)": 0.037449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 71.275, "completions/min_length": 47.6, "epoch": 0.3424242424242424, "grad_norm": 1.4955190420150757, "kl": 0.04173090234398842, "learning_rate": 3.695254855394527e-06, "loss": 0.08269641399383545, "memory(GiB)": 69.34, "reward": 0.8987454950809479, "reward_std": 0.325892648845911, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.1987454891204834, "rewards/ReportKG_Jaccard/std": 0.07133635506033897, "step": 1695, "train_speed(iter/s)": 0.037482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 62.875, "completions/min_length": 45.2, "epoch": 0.3434343434343434, "grad_norm": 1.287684679031372, "kl": 0.05130814202129841, "learning_rate": 3.6910321771564085e-06, "loss": 0.038499081134796144, "memory(GiB)": 69.34, "reward": 1.1153439283370972, "reward_std": 0.39916523098945617, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.29034392833709716, "rewards/ReportKG_Jaccard/std": 0.10462931469082833, "step": 1700, "train_speed(iter/s)": 0.037499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 72.475, "completions/min_length": 56.8, "epoch": 0.34444444444444444, "grad_norm": 1.1744954586029053, "kl": 0.03314276933670044, "learning_rate": 3.6867828916257713e-06, "loss": 0.049308490753173825, "memory(GiB)": 69.34, "reward": 1.011796772480011, "reward_std": 0.26446624994277956, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.23679674565792083, "rewards/ReportKG_Jaccard/std": 0.08763740360736846, "step": 1705, "train_speed(iter/s)": 0.037513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 63.25, "completions/min_length": 49.4, "epoch": 0.34545454545454546, "grad_norm": 1.2759398221969604, "kl": 0.0503157339990139, "learning_rate": 3.6825070656623624e-06, "loss": 0.10703715085983276, "memory(GiB)": 69.34, "reward": 1.0770387172698974, "reward_std": 0.3116839602589607, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22703870236873627, "rewards/ReportKG_Jaccard/std": 0.07000982686877251, "step": 1710, "train_speed(iter/s)": 0.037531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 77.125, "completions/min_length": 57.0, "epoch": 0.3464646464646465, "grad_norm": 1.390560507774353, "kl": 0.04130200706422329, "learning_rate": 3.6782047665435244e-06, "loss": 0.021171629428863525, "memory(GiB)": 69.34, "reward": 0.998974621295929, "reward_std": 0.31422490924596785, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.19897463470697402, "rewards/ReportKG_Jaccard/std": 0.05289416648447513, "step": 1715, "train_speed(iter/s)": 0.037544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 73.625, "completions/min_length": 56.0, "epoch": 0.3474747474747475, "grad_norm": 1.0303878784179688, "kl": 0.02596118412911892, "learning_rate": 3.673876061963139e-06, "loss": 0.0006847160402685404, "memory(GiB)": 69.34, "reward": 0.5304444849491119, "reward_std": 0.4604190945625305, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.1554444834589958, "rewards/ReportKG_Jaccard/std": 0.05091646052896977, "step": 1720, "train_speed(iter/s)": 0.037505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 76.55, "completions/min_length": 60.4, "epoch": 0.3484848484848485, "grad_norm": 1.3168240785598755, "kl": 0.03299897164106369, "learning_rate": 3.6695210200305603e-06, "loss": 0.011545738577842713, "memory(GiB)": 69.34, "reward": 0.9046691179275512, "reward_std": 0.27558058947324754, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.20466911792755127, "rewards/ReportKG_Jaccard/std": 0.07645663022994995, "step": 1725, "train_speed(iter/s)": 0.037516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.6, "completions/mean_length": 83.2, "completions/min_length": 62.2, "epoch": 0.34949494949494947, "grad_norm": 1.3525621891021729, "kl": 0.07506313472986222, "learning_rate": 3.665139709269542e-06, "loss": 0.008815070986747742, "memory(GiB)": 69.34, "reward": 0.7356889307498932, "reward_std": 0.42016426026821135, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3880294978618622, "rewards/ReportKG_Jaccard/mean": 0.18568892776966095, "rewards/ReportKG_Jaccard/std": 0.06300879716873169, "step": 1730, "train_speed(iter/s)": 0.037547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 70.325, "completions/min_length": 54.6, "epoch": 0.3505050505050505, "grad_norm": 2.0476019382476807, "kl": 0.04934939853847027, "learning_rate": 3.660732198617165e-06, "loss": -0.023042207956314086, "memory(GiB)": 69.34, "reward": 0.777572762966156, "reward_std": 0.4266380026936531, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3812558650970459, "rewards/ReportKG_Jaccard/mean": 0.25257275104522703, "rewards/ReportKG_Jaccard/std": 0.06827771961688996, "step": 1735, "train_speed(iter/s)": 0.037565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 72.5, "completions/min_length": 45.2, "epoch": 0.3515151515151515, "grad_norm": 1.4819860458374023, "kl": 0.04321167767047882, "learning_rate": 3.6562985574227412e-06, "loss": 0.061064934730529784, "memory(GiB)": 69.34, "reward": 0.6712871491909027, "reward_std": 0.4019786689430475, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3703280448913574, "rewards/ReportKG_Jaccard/mean": 0.17128715366125108, "rewards/ReportKG_Jaccard/std": 0.06330619491636753, "step": 1740, "train_speed(iter/s)": 0.037493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.6, "completions/mean_length": 75.225, "completions/min_length": 57.8, "epoch": 0.3525252525252525, "grad_norm": 1.011015772819519, "kl": 0.03830952197313309, "learning_rate": 3.651838855446738e-06, "loss": -0.02295074462890625, "memory(GiB)": 69.34, "reward": 0.7813801854848862, "reward_std": 0.3785560607910156, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.23138017654418946, "rewards/ReportKG_Jaccard/std": 0.04170529581606388, "step": 1745, "train_speed(iter/s)": 0.037516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 75.05, "completions/min_length": 53.0, "epoch": 0.35353535353535354, "grad_norm": 1.191598892211914, "kl": 0.037607663124799726, "learning_rate": 3.6473531628596653e-06, "loss": 0.007802788913249969, "memory(GiB)": 69.34, "reward": 0.8318883836269378, "reward_std": 0.1949826419353485, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.2318884015083313, "rewards/ReportKG_Jaccard/std": 0.06633443608880044, "step": 1750, "train_speed(iter/s)": 0.037516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.0, "completions/mean_length": 66.15, "completions/min_length": 48.6, "epoch": 0.35454545454545455, "grad_norm": 1.4165409803390503, "kl": 0.02585367448627949, "learning_rate": 3.6428415502409832e-06, "loss": 0.0782991111278534, "memory(GiB)": 69.34, "reward": 0.8700188428163529, "reward_std": 0.27693274468183515, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.24501883685588838, "rewards/ReportKG_Jaccard/std": 0.0878092497587204, "step": 1755, "train_speed(iter/s)": 0.037507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.2, "completions/mean_length": 78.075, "completions/min_length": 55.8, "epoch": 0.35555555555555557, "grad_norm": 1.410719394683838, "kl": 0.03040403835475445, "learning_rate": 3.6383040885779835e-06, "loss": 0.04772518873214722, "memory(GiB)": 69.34, "reward": 0.82648064494133, "reward_std": 0.5129022300243378, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.45196654200553893, "rewards/ReportKG_Jaccard/mean": 0.2764806360006332, "rewards/ReportKG_Jaccard/std": 0.10535919666290283, "step": 1760, "train_speed(iter/s)": 0.037489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.4, "completions/mean_length": 80.15, "completions/min_length": 60.4, "epoch": 0.3565656565656566, "grad_norm": 1.1993448734283447, "kl": 0.04232686087489128, "learning_rate": 3.6337408492646773e-06, "loss": -0.001692010462284088, "memory(GiB)": 69.34, "reward": 0.8840619921684265, "reward_std": 0.3775011420249939, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.31392415761947634, "rewards/ReportKG_Jaccard/mean": 0.18406201153993607, "rewards/ReportKG_Jaccard/std": 0.0791990615427494, "step": 1765, "train_speed(iter/s)": 0.037485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 66.425, "completions/min_length": 46.8, "epoch": 0.3575757575757576, "grad_norm": 1.4109045267105103, "kl": 0.030171828344464302, "learning_rate": 3.6291519041006715e-06, "loss": 0.02648068368434906, "memory(GiB)": 69.34, "reward": 1.0031123399734496, "reward_std": 0.2377036392688751, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.27811232954263687, "rewards/ReportKG_Jaccard/std": 0.08738096654415131, "step": 1770, "train_speed(iter/s)": 0.037491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.8, "completions/mean_length": 75.175, "completions/min_length": 60.2, "epoch": 0.35858585858585856, "grad_norm": 1.003345251083374, "kl": 0.041795763000845906, "learning_rate": 3.6245373252900346e-06, "loss": 0.041649529337883, "memory(GiB)": 69.34, "reward": 0.8875355005264283, "reward_std": 0.3690749317407608, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.2875354826450348, "rewards/ReportKG_Jaccard/std": 0.10197630524635315, "step": 1775, "train_speed(iter/s)": 0.037507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 78.7, "completions/min_length": 55.4, "epoch": 0.3595959595959596, "grad_norm": 1.2431625127792358, "kl": 0.04320463724434376, "learning_rate": 3.619897185440167e-06, "loss": -0.01307528167963028, "memory(GiB)": 69.34, "reward": 0.9829161942005158, "reward_std": 0.2469934731721878, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.20791617557406425, "rewards/ReportKG_Jaccard/std": 0.05315328799188137, "step": 1780, "train_speed(iter/s)": 0.037497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.8, "completions/mean_length": 72.0, "completions/min_length": 54.2, "epoch": 0.3606060606060606, "grad_norm": 1.0042781829833984, "kl": 0.04704674631357193, "learning_rate": 3.615231557560653e-06, "loss": -0.02314397245645523, "memory(GiB)": 69.34, "reward": 0.7217396259307861, "reward_std": 0.35872538983821867, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.19673963263630867, "rewards/ReportKG_Jaccard/std": 0.049962335452437404, "step": 1785, "train_speed(iter/s)": 0.037516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 73.325, "completions/min_length": 57.4, "epoch": 0.3616161616161616, "grad_norm": 1.4377641677856445, "kl": 0.04775975421071053, "learning_rate": 3.6105405150621173e-06, "loss": 0.027398866415023804, "memory(GiB)": 69.34, "reward": 0.8767862379550934, "reward_std": 0.33516086637973785, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.25178624987602233, "rewards/ReportKG_Jaccard/std": 0.06764857172966003, "step": 1790, "train_speed(iter/s)": 0.037546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.4, "completions/mean_length": 62.05, "completions/min_length": 48.4, "epoch": 0.36262626262626263, "grad_norm": 1.302555799484253, "kl": 0.046427908167243, "learning_rate": 3.6058241317550636e-06, "loss": 0.09501888155937195, "memory(GiB)": 69.34, "reward": 1.0671864748001099, "reward_std": 0.2692760184407234, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2921864688396454, "rewards/ReportKG_Jaccard/std": 0.08033828884363174, "step": 1795, "train_speed(iter/s)": 0.037571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 69.125, "completions/min_length": 50.8, "epoch": 0.36363636363636365, "grad_norm": 1.7402452230453491, "kl": 0.04263134822249413, "learning_rate": 3.6010824818487207e-06, "loss": 0.043149504065513614, "memory(GiB)": 69.34, "reward": 0.81054328083992, "reward_std": 0.2920264393091202, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.21054329574108124, "rewards/ReportKG_Jaccard/std": 0.06897656470537186, "step": 1800, "train_speed(iter/s)": 0.037593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 70.575, "completions/min_length": 50.8, "epoch": 0.36464646464646466, "grad_norm": 1.4377686977386475, "kl": 0.043743930757045746, "learning_rate": 3.5963156399498677e-06, "loss": 0.02885291576385498, "memory(GiB)": 69.34, "reward": 0.8324194312095642, "reward_std": 0.2676810324192047, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.25741942673921586, "rewards/ReportKG_Jaccard/std": 0.08497763872146606, "step": 1805, "train_speed(iter/s)": 0.037561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 68.1, "completions/min_length": 48.2, "epoch": 0.3656565656565657, "grad_norm": 1.6691758632659912, "kl": 0.03659243956208229, "learning_rate": 3.591523681061664e-06, "loss": 0.01981653571128845, "memory(GiB)": 69.34, "reward": 0.7119343101978302, "reward_std": 0.15872237905859948, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.18693431168794633, "rewards/ReportKG_Jaccard/std": 0.059672094881534576, "step": 1810, "train_speed(iter/s)": 0.037538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.6, "completions/mean_length": 76.3, "completions/min_length": 57.4, "epoch": 0.36666666666666664, "grad_norm": 1.3562873601913452, "kl": 0.043228307366371156, "learning_rate": 3.5867066805824702e-06, "loss": 0.02509947419166565, "memory(GiB)": 69.34, "reward": 0.9318358898162842, "reward_std": 0.35293610841035844, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.30683593451976776, "rewards/ReportKG_Jaccard/std": 0.07968738898634911, "step": 1815, "train_speed(iter/s)": 0.037551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.8, "completions/mean_length": 82.425, "completions/min_length": 62.0, "epoch": 0.36767676767676766, "grad_norm": 2.055974245071411, "kl": 0.03840247727930546, "learning_rate": 3.5818647143046583e-06, "loss": 0.02983747124671936, "memory(GiB)": 69.34, "reward": 0.9231750726699829, "reward_std": 0.2965607509016991, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.1981750726699829, "rewards/ReportKG_Jaccard/std": 0.06886626668274402, "step": 1820, "train_speed(iter/s)": 0.037553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 69.375, "completions/min_length": 47.8, "epoch": 0.3686868686868687, "grad_norm": 1.3454563617706299, "kl": 0.03808989152312279, "learning_rate": 3.5769978584134214e-06, "loss": 0.013560348749160766, "memory(GiB)": 69.34, "reward": 0.9035352051258088, "reward_std": 0.23320495560765267, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.25353520959615705, "rewards/ReportKG_Jaccard/std": 0.07379620522260666, "step": 1825, "train_speed(iter/s)": 0.037522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 73.95, "completions/min_length": 59.2, "epoch": 0.3696969696969697, "grad_norm": 1.2823184728622437, "kl": 0.03211322017014027, "learning_rate": 3.5721061894855747e-06, "loss": 0.04599677324295044, "memory(GiB)": 69.34, "reward": 0.8296832382678986, "reward_std": 0.23795376121997833, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.2796832174062729, "rewards/ReportKG_Jaccard/std": 0.10704943388700486, "step": 1830, "train_speed(iter/s)": 0.037539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.0, "completions/mean_length": 82.475, "completions/min_length": 61.6, "epoch": 0.3707070707070707, "grad_norm": 1.275458574295044, "kl": 0.03759850710630417, "learning_rate": 3.5671897844883506e-06, "loss": 0.044387584924697875, "memory(GiB)": 69.34, "reward": 0.5028300017118454, "reward_std": 0.36962263733148576, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.20283000022172928, "rewards/ReportKG_Jaccard/std": 0.07870891019701957, "step": 1835, "train_speed(iter/s)": 0.037507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 73.4, "completions/min_length": 58.4, "epoch": 0.3717171717171717, "grad_norm": 1.4396588802337646, "kl": 0.04751861765980721, "learning_rate": 3.5622487207781866e-06, "loss": -0.024547260999679566, "memory(GiB)": 69.34, "reward": 0.9573305487632752, "reward_std": 0.37010217010974883, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.282330584526062, "rewards/ReportKG_Jaccard/std": 0.06677078232169151, "step": 1840, "train_speed(iter/s)": 0.037489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 78.925, "completions/min_length": 63.2, "epoch": 0.37272727272727274, "grad_norm": 1.0561749935150146, "kl": 0.04575972482562065, "learning_rate": 3.55728307609951e-06, "loss": 0.023332352936267852, "memory(GiB)": 69.34, "reward": 0.7096945405006408, "reward_std": 0.4463078320026398, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.28469454050064086, "rewards/ReportKG_Jaccard/std": 0.07048648968338966, "step": 1845, "train_speed(iter/s)": 0.037499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 78.625, "completions/min_length": 56.8, "epoch": 0.37373737373737376, "grad_norm": 1.2810674905776978, "kl": 0.03430557548999787, "learning_rate": 3.5522929285835134e-06, "loss": 0.04576590657234192, "memory(GiB)": 69.34, "reward": 0.8009779274463653, "reward_std": 0.4620295107364655, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.22597794383764266, "rewards/ReportKG_Jaccard/std": 0.0779692880809307, "step": 1850, "train_speed(iter/s)": 0.037491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 74.1, "completions/min_length": 52.6, "epoch": 0.3747474747474748, "grad_norm": 1.2078654766082764, "kl": 0.040298032015562056, "learning_rate": 3.5472783567469257e-06, "loss": -0.007449162006378174, "memory(GiB)": 69.34, "reward": 0.8728307902812957, "reward_std": 0.36443296521902085, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.19783080220222474, "rewards/ReportKG_Jaccard/std": 0.06970254853367805, "step": 1855, "train_speed(iter/s)": 0.037506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 72.15, "completions/min_length": 59.8, "epoch": 0.37575757575757573, "grad_norm": 1.3534358739852905, "kl": 0.03271715305745602, "learning_rate": 3.542239439490775e-06, "loss": 0.0003305405378341675, "memory(GiB)": 69.34, "reward": 0.5422607421875, "reward_std": 0.4729518353939056, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.4553455114364624, "rewards/ReportKG_Jaccard/mean": 0.24226074516773224, "rewards/ReportKG_Jaccard/std": 0.04980861060321331, "step": 1860, "train_speed(iter/s)": 0.037531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 70.625, "completions/min_length": 51.8, "epoch": 0.37676767676767675, "grad_norm": 2.122056007385254, "kl": 0.040571126341819766, "learning_rate": 3.5371762560991516e-06, "loss": 0.043534889817237854, "memory(GiB)": 69.34, "reward": 0.6859071731567383, "reward_std": 0.3703291490674019, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.18590715825557708, "rewards/ReportKG_Jaccard/std": 0.07679005190730095, "step": 1865, "train_speed(iter/s)": 0.037552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/mean_length": 83.525, "completions/min_length": 57.0, "epoch": 0.37777777777777777, "grad_norm": 1.4569871425628662, "kl": 0.04127707555890083, "learning_rate": 3.532088886237956e-06, "loss": -0.04530249834060669, "memory(GiB)": 69.34, "reward": 0.7712965250015259, "reward_std": 0.437159937620163, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4191516935825348, "rewards/ReportKG_Jaccard/mean": 0.19629652798175812, "rewards/ReportKG_Jaccard/std": 0.05225940868258476, "step": 1870, "train_speed(iter/s)": 0.037569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.2, "completions/mean_length": 80.05, "completions/min_length": 59.8, "epoch": 0.3787878787878788, "grad_norm": 1.2480729818344116, "kl": 0.026008127629756926, "learning_rate": 3.526977409953647e-06, "loss": 0.06918636560440064, "memory(GiB)": 69.34, "reward": 0.7825409173965454, "reward_std": 0.4477749586105347, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.40650616884231566, "rewards/ReportKG_Jaccard/mean": 0.1825409084558487, "rewards/ReportKG_Jaccard/std": 0.07147345915436745, "step": 1875, "train_speed(iter/s)": 0.037564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.0, "completions/mean_length": 89.375, "completions/min_length": 61.4, "epoch": 0.3797979797979798, "grad_norm": 1.0197334289550781, "kl": 0.03395793326199055, "learning_rate": 3.521841907671983e-06, "loss": 0.013477955758571625, "memory(GiB)": 69.34, "reward": 0.6630441665649414, "reward_std": 0.40828312635421754, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.2130441978573799, "rewards/ReportKG_Jaccard/std": 0.06549729816615582, "step": 1880, "train_speed(iter/s)": 0.037573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/mean_length": 82.05, "completions/min_length": 59.0, "epoch": 0.3808080808080808, "grad_norm": 0.9506113529205322, "kl": 0.03232410810887813, "learning_rate": 3.5166824601967548e-06, "loss": -0.024375975131988525, "memory(GiB)": 69.34, "reward": 0.8532946646213532, "reward_std": 0.3746455103158951, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.20329467654228212, "rewards/ReportKG_Jaccard/std": 0.05326524302363396, "step": 1885, "train_speed(iter/s)": 0.037591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 76.175, "completions/min_length": 63.0, "epoch": 0.38181818181818183, "grad_norm": 1.0845324993133545, "kl": 0.04199531003832817, "learning_rate": 3.5114991487085164e-06, "loss": 0.04803017973899841, "memory(GiB)": 69.34, "reward": 0.8959307789802551, "reward_std": 0.3799197345972061, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.27093076705932617, "rewards/ReportKG_Jaccard/std": 0.06438260339200497, "step": 1890, "train_speed(iter/s)": 0.037613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 68.35, "completions/min_length": 51.0, "epoch": 0.38282828282828285, "grad_norm": 1.2852035760879517, "kl": 0.03793407194316387, "learning_rate": 3.5062920547633063e-06, "loss": 0.0600468635559082, "memory(GiB)": 69.34, "reward": 1.1520644426345825, "reward_std": 0.3507825702428818, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.3020644336938858, "rewards/ReportKG_Jaccard/std": 0.061323092132806775, "step": 1895, "train_speed(iter/s)": 0.037637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 71.025, "completions/min_length": 49.4, "epoch": 0.3838383838383838, "grad_norm": 1.5149401426315308, "kl": 0.019615575298666955, "learning_rate": 3.5010612602913644e-06, "loss": 0.009544944763183594, "memory(GiB)": 69.34, "reward": 0.7536465942859649, "reward_std": 0.3648734800517559, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.22864657640457153, "rewards/ReportKG_Jaccard/std": 0.07617845609784127, "step": 1900, "train_speed(iter/s)": 0.037612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.2, "completions/mean_length": 75.35, "completions/min_length": 59.8, "epoch": 0.38484848484848483, "grad_norm": 1.2859512567520142, "kl": 0.05332225486636162, "learning_rate": 3.495806847595842e-06, "loss": 0.07927954196929932, "memory(GiB)": 69.34, "reward": 1.0691228806972504, "reward_std": 0.3152169965207577, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.31912284195423124, "rewards/ReportKG_Jaccard/std": 0.10325354672968387, "step": 1905, "train_speed(iter/s)": 0.037616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 81.125, "completions/min_length": 55.2, "epoch": 0.38585858585858585, "grad_norm": 1.3185871839523315, "kl": 0.024861627817153932, "learning_rate": 3.4905288993515096e-06, "loss": 0.05956749320030212, "memory(GiB)": 69.34, "reward": 0.9032783389091492, "reward_std": 0.3890251576900482, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.2782783478498459, "rewards/ReportKG_Jaccard/std": 0.09173431470990182, "step": 1910, "train_speed(iter/s)": 0.037625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 59.05, "completions/min_length": 43.4, "epoch": 0.38686868686868686, "grad_norm": 1.4317119121551514, "kl": 0.037638626992702484, "learning_rate": 3.4852274986034526e-06, "loss": 0.017843037843704224, "memory(GiB)": 69.34, "reward": 0.8158306360244751, "reward_std": 0.3942592069506645, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.16583064049482346, "rewards/ReportKG_Jaccard/std": 0.0593066144734621, "step": 1915, "train_speed(iter/s)": 0.03766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.6, "completions/mean_length": 67.4, "completions/min_length": 51.6, "epoch": 0.3878787878787879, "grad_norm": 1.281331181526184, "kl": 0.03136465772986412, "learning_rate": 3.479902728765768e-06, "loss": 0.03240785300731659, "memory(GiB)": 69.34, "reward": 0.9262050211429596, "reward_std": 0.33269758969545365, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22620502412319182, "rewards/ReportKG_Jaccard/std": 0.07732478156685829, "step": 1920, "train_speed(iter/s)": 0.037695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 73.6, "completions/min_length": 54.8, "epoch": 0.3888888888888889, "grad_norm": 1.1796311140060425, "kl": 0.029582563787698746, "learning_rate": 3.474554673620248e-06, "loss": 0.024609294533729554, "memory(GiB)": 69.34, "reward": 0.49739490151405336, "reward_std": 0.35862505733966826, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.22239490151405333, "rewards/ReportKG_Jaccard/std": 0.0914742186665535, "step": 1925, "train_speed(iter/s)": 0.037679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 68.575, "completions/min_length": 53.2, "epoch": 0.3898989898989899, "grad_norm": 1.3102878332138062, "kl": 0.0372127290815115, "learning_rate": 3.4691834173150662e-06, "loss": -0.03530659973621368, "memory(GiB)": 69.34, "reward": 0.9390535473823547, "reward_std": 0.3289530977606773, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.26405355930328367, "rewards/ReportKG_Jaccard/std": 0.06297316066920758, "step": 1930, "train_speed(iter/s)": 0.037667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 64.55, "completions/min_length": 43.2, "epoch": 0.39090909090909093, "grad_norm": 1.8521044254302979, "kl": 0.04214425608515739, "learning_rate": 3.4637890443634507e-06, "loss": 0.0963703453540802, "memory(GiB)": 69.34, "reward": 0.8611887335777283, "reward_std": 0.3796192228794098, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.23618872165679933, "rewards/ReportKG_Jaccard/std": 0.07189608961343766, "step": 1935, "train_speed(iter/s)": 0.037686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.2, "completions/mean_length": 81.65, "completions/min_length": 61.6, "epoch": 0.39191919191919194, "grad_norm": 0.9576144218444824, "kl": 0.03682507276535034, "learning_rate": 3.458371639642354e-06, "loss": -0.010075516998767853, "memory(GiB)": 69.34, "reward": 1.1117460012435914, "reward_std": 0.359600818157196, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.26174601912498474, "rewards/ReportKG_Jaccard/std": 0.06970723196864129, "step": 1940, "train_speed(iter/s)": 0.037674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 74.25, "completions/min_length": 56.4, "epoch": 0.3929292929292929, "grad_norm": 3.6155993938446045, "kl": 0.04728824310004711, "learning_rate": 3.45293128839112e-06, "loss": -0.013107310235500335, "memory(GiB)": 69.34, "reward": 0.7047531306743622, "reward_std": 0.2558328688144684, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.20475310757756232, "rewards/ReportKG_Jaccard/std": 0.05700149349868298, "step": 1945, "train_speed(iter/s)": 0.037681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 74.4, "completions/min_length": 58.4, "epoch": 0.3939393939393939, "grad_norm": 1.0801608562469482, "kl": 0.03438417688012123, "learning_rate": 3.44746807621014e-06, "loss": -0.011360391974449158, "memory(GiB)": 69.34, "reward": 0.6299920558929444, "reward_std": 0.3943236365914345, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.22999203354120254, "rewards/ReportKG_Jaccard/std": 0.06994296088814736, "step": 1950, "train_speed(iter/s)": 0.037677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 73.25, "completions/min_length": 61.4, "epoch": 0.39494949494949494, "grad_norm": 1.0375927686691284, "kl": 0.02832186296582222, "learning_rate": 3.4419820890595083e-06, "loss": -0.0006882157176733017, "memory(GiB)": 69.34, "reward": 0.7842443645000458, "reward_std": 0.3331880494952202, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.25924434065818786, "rewards/ReportKG_Jaccard/std": 0.07997064366936683, "step": 1955, "train_speed(iter/s)": 0.037667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.6, "completions/mean_length": 69.775, "completions/min_length": 57.6, "epoch": 0.39595959595959596, "grad_norm": 1.1996389627456665, "kl": 0.046665864810347554, "learning_rate": 3.4364734132576683e-06, "loss": 0.038754045963287354, "memory(GiB)": 69.34, "reward": 1.096902894973755, "reward_std": 0.34666325002908704, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.2469028890132904, "rewards/ReportKG_Jaccard/std": 0.049552975594997405, "step": 1960, "train_speed(iter/s)": 0.037701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 67.225, "completions/min_length": 55.0, "epoch": 0.396969696969697, "grad_norm": 1.6418718099594116, "kl": 0.03193906769156456, "learning_rate": 3.4309421354800526e-06, "loss": 0.01316046416759491, "memory(GiB)": 69.34, "reward": 1.042246925830841, "reward_std": 0.37625750303268435, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.26724691689014435, "rewards/ReportKG_Jaccard/std": 0.13030841127038, "step": 1965, "train_speed(iter/s)": 0.037715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 72.025, "completions/min_length": 57.0, "epoch": 0.397979797979798, "grad_norm": 1.3088696002960205, "kl": 0.03543006405234337, "learning_rate": 3.4253883427577257e-06, "loss": 0.031393349170684814, "memory(GiB)": 69.34, "reward": 0.5047309994697571, "reward_std": 0.40967167913913727, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.2047309771180153, "rewards/ReportKG_Jaccard/std": 0.08267486467957497, "step": 1970, "train_speed(iter/s)": 0.037726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 76.275, "completions/min_length": 63.2, "epoch": 0.398989898989899, "grad_norm": 1.000829815864563, "kl": 0.08557871505618095, "learning_rate": 3.4198121224760054e-06, "loss": 0.047032099962234494, "memory(GiB)": 69.34, "reward": 1.0235365629196167, "reward_std": 0.3479070156812668, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.24853655099868774, "rewards/ReportKG_Jaccard/std": 0.04631008766591549, "step": 1975, "train_speed(iter/s)": 0.037675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.8, "completions/mean_length": 86.55, "completions/min_length": 65.6, "epoch": 0.4, "grad_norm": 1.1954832077026367, "kl": 0.03370936475694179, "learning_rate": 3.414213562373095e-06, "loss": -0.0409719318151474, "memory(GiB)": 69.34, "reward": 0.7025453329086304, "reward_std": 0.4104907110333443, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.2275453120470047, "rewards/ReportKG_Jaccard/std": 0.06127718277275562, "step": 1980, "train_speed(iter/s)": 0.037644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 71.0, "completions/min_length": 50.6, "epoch": 0.401010101010101, "grad_norm": 1.666964054107666, "kl": 0.042733832448720935, "learning_rate": 3.4085927505387e-06, "loss": 0.044077956676483156, "memory(GiB)": 69.34, "reward": 0.783547842502594, "reward_std": 0.44615113735198975, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.20854786336421965, "rewards/ReportKG_Jaccard/std": 0.07544957995414733, "step": 1985, "train_speed(iter/s)": 0.037653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 70.475, "completions/min_length": 57.2, "epoch": 0.402020202020202, "grad_norm": 1.9325385093688965, "kl": 0.02554580420255661, "learning_rate": 3.4029497754126426e-06, "loss": -0.05434509515762329, "memory(GiB)": 69.34, "reward": 0.5727274268865585, "reward_std": 0.45423188209533694, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.41403932571411134, "rewards/ReportKG_Jaccard/mean": 0.2227274477481842, "rewards/ReportKG_Jaccard/std": 0.08425540700554848, "step": 1990, "train_speed(iter/s)": 0.037638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 82.125, "completions/min_length": 65.4, "epoch": 0.403030303030303, "grad_norm": 0.9893093705177307, "kl": 0.04654711596667767, "learning_rate": 3.397284725783469e-06, "loss": 0.037932172417640686, "memory(GiB)": 69.34, "reward": 0.9313066959381103, "reward_std": 0.4012898176908493, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.20630667433142663, "rewards/ReportKG_Jaccard/std": 0.04180964883416891, "step": 1995, "train_speed(iter/s)": 0.037638 }, { "epoch": 0.40404040404040403, "grad_norm": 1.1393389701843262, "learning_rate": 3.391597690787055e-06, "loss": 0.023546113073825835, "memory(GiB)": 69.34, "step": 2000, "train_speed(iter/s)": 0.037626 }, { "epoch": 0.40404040404040403, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 101.74, "eval_completions/mean_length": 78.0, "eval_completions/min_length": 58.78, "eval_kl": 0.041802682019770145, "eval_loss": 0.008578057400882244, "eval_reward": 0.7132659693062305, "eval_reward_std": 0.29614583976566794, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.5125, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.2528535896539688, "eval_rewards/ReportKG_Jaccard/mean": 0.2007659688591957, "eval_rewards/ReportKG_Jaccard/std": 0.06126864455640316, "eval_runtime": 915.3432, "eval_samples_per_second": 0.055, "eval_steps_per_second": 0.008, "step": 2000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.3, "completions/mean_length": 77.8375, "completions/min_length": 59.5, "epoch": 0.40505050505050505, "grad_norm": 1.2799369096755981, "kl": 0.0378800680860877, "learning_rate": 3.3858887599052004e-06, "loss": -0.02273252159357071, "memory(GiB)": 69.34, "reward": 0.8136347442865371, "reward_std": 0.39364954084157944, "rewards/MultiModalAccuracyORM_Any/mean": 0.5375, "rewards/MultiModalAccuracyORM_Any/std": 0.3484488636255264, "rewards/ReportKG_Jaccard/mean": 0.27613474130630494, "rewards/ReportKG_Jaccard/std": 0.08032092303037644, "step": 2005, "train_speed(iter/s)": 0.036953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 78.975, "completions/min_length": 53.4, "epoch": 0.40606060606060607, "grad_norm": 0.8880646824836731, "kl": 0.07218711152672767, "learning_rate": 3.380158022964224e-06, "loss": 0.02707512378692627, "memory(GiB)": 69.34, "reward": 1.0176565170288085, "reward_std": 0.4228007435798645, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.21765652745962144, "rewards/ReportKG_Jaccard/std": 0.058187895268201825, "step": 2010, "train_speed(iter/s)": 0.036941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 67.75, "completions/min_length": 49.4, "epoch": 0.4070707070707071, "grad_norm": 1.2242276668548584, "kl": 0.05051769576966762, "learning_rate": 3.374405570133547e-06, "loss": 0.06437984108924866, "memory(GiB)": 69.34, "reward": 0.8739655315876007, "reward_std": 0.2303403154015541, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.17396548315882682, "rewards/ReportKG_Jaccard/std": 0.0713625393807888, "step": 2015, "train_speed(iter/s)": 0.036948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.6, "completions/mean_length": 84.25, "completions/min_length": 64.6, "epoch": 0.4080808080808081, "grad_norm": 1.534282922744751, "kl": 0.04742908664047718, "learning_rate": 3.3686314919242762e-06, "loss": 0.034516257047653195, "memory(GiB)": 69.34, "reward": 0.8240189671516418, "reward_std": 0.326160928606987, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.24901897162199021, "rewards/ReportKG_Jaccard/std": 0.0828708328306675, "step": 2020, "train_speed(iter/s)": 0.036946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.6, "completions/mean_length": 76.075, "completions/min_length": 56.2, "epoch": 0.4090909090909091, "grad_norm": 1.1992114782333374, "kl": 0.036278778687119484, "learning_rate": 3.3628358791877826e-06, "loss": 0.03652783036231995, "memory(GiB)": 69.34, "reward": 1.0164274156093598, "reward_std": 0.2023513361811638, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.26642740070819854, "rewards/ReportKG_Jaccard/std": 0.07122243270277977, "step": 2025, "train_speed(iter/s)": 0.036946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.0, "completions/mean_length": 70.225, "completions/min_length": 52.4, "epoch": 0.4101010101010101, "grad_norm": 0.9263531565666199, "kl": 0.04679432176053524, "learning_rate": 3.3570188231142643e-06, "loss": 0.008691610395908355, "memory(GiB)": 69.34, "reward": 0.8785491853952407, "reward_std": 0.2964409813284874, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.22854914963245393, "rewards/ReportKG_Jaccard/std": 0.06235882565379143, "step": 2030, "train_speed(iter/s)": 0.036927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 71.85, "completions/min_length": 52.6, "epoch": 0.4111111111111111, "grad_norm": 1.3611764907836914, "kl": 0.031433761864900586, "learning_rate": 3.3511804152313205e-06, "loss": 0.007166276872158051, "memory(GiB)": 69.34, "reward": 0.9387526392936707, "reward_std": 0.33475171476602555, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.23875264823436737, "rewards/ReportKG_Jaccard/std": 0.09029015898704529, "step": 2035, "train_speed(iter/s)": 0.036945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.0, "completions/mean_length": 73.775, "completions/min_length": 61.2, "epoch": 0.4121212121212121, "grad_norm": 1.588207721710205, "kl": 0.029716913029551505, "learning_rate": 3.3453207474025053e-06, "loss": 0.07349755167961121, "memory(GiB)": 69.34, "reward": 0.919830709695816, "reward_std": 0.310647089779377, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.24483070373535157, "rewards/ReportKG_Jaccard/std": 0.0822257373481989, "step": 2040, "train_speed(iter/s)": 0.036958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 70.425, "completions/min_length": 49.8, "epoch": 0.4131313131313131, "grad_norm": 1.5226994752883911, "kl": 0.05297303088009357, "learning_rate": 3.3394399118258836e-06, "loss": 0.07034488320350647, "memory(GiB)": 69.34, "reward": 1.217260718345642, "reward_std": 0.28359406590461733, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.29226069450378417, "rewards/ReportKG_Jaccard/std": 0.09911630861461163, "step": 2045, "train_speed(iter/s)": 0.036947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.8, "completions/mean_length": 78.925, "completions/min_length": 54.8, "epoch": 0.41414141414141414, "grad_norm": 1.1891471147537231, "kl": 0.05144170671701431, "learning_rate": 3.3335380010325833e-06, "loss": -0.002965361624956131, "memory(GiB)": 69.34, "reward": 0.8813567161560059, "reward_std": 0.30220722407102585, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2063567027449608, "rewards/ReportKG_Jaccard/std": 0.06761774495244026, "step": 2050, "train_speed(iter/s)": 0.036937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 76.325, "completions/min_length": 57.8, "epoch": 0.41515151515151516, "grad_norm": 1.5878015756607056, "kl": 0.027451307326555253, "learning_rate": 3.327615107885335e-06, "loss": 0.0010841310024261475, "memory(GiB)": 69.34, "reward": 0.5676001667976379, "reward_std": 0.4743816196918488, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.24260017275810242, "rewards/ReportKG_Jaccard/std": 0.0629837080836296, "step": 2055, "train_speed(iter/s)": 0.036951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 125.0, "completions/mean_length": 89.975, "completions/min_length": 63.4, "epoch": 0.4161616161616162, "grad_norm": 1.482743501663208, "kl": 0.026110623404383658, "learning_rate": 3.321671325577014e-06, "loss": -0.007742792367935181, "memory(GiB)": 69.34, "reward": 0.6681167840957641, "reward_std": 0.33786151707172396, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.16811676621437072, "rewards/ReportKG_Jaccard/std": 0.05141989886760712, "step": 2060, "train_speed(iter/s)": 0.03693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 80.075, "completions/min_length": 61.8, "epoch": 0.4171717171717172, "grad_norm": 1.333716630935669, "kl": 0.03917356990277767, "learning_rate": 3.315706747629173e-06, "loss": 0.03461475670337677, "memory(GiB)": 69.34, "reward": 0.8771398484706878, "reward_std": 0.30649760738015175, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2021398589015007, "rewards/ReportKG_Jaccard/std": 0.07362473160028457, "step": 2065, "train_speed(iter/s)": 0.036914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 76.35, "completions/min_length": 56.2, "epoch": 0.41818181818181815, "grad_norm": 1.6511272192001343, "kl": 0.048394615203142165, "learning_rate": 3.3097214678905703e-06, "loss": -0.007140126824378967, "memory(GiB)": 69.34, "reward": 0.9592708945274353, "reward_std": 0.33485492020845414, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.23427088260650636, "rewards/ReportKG_Jaccard/std": 0.09084017798304558, "step": 2070, "train_speed(iter/s)": 0.036887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.6, "completions/mean_length": 85.575, "completions/min_length": 60.0, "epoch": 0.41919191919191917, "grad_norm": 1.2637256383895874, "kl": 0.04155043140053749, "learning_rate": 3.303715580535693e-06, "loss": 0.04129897058010101, "memory(GiB)": 69.34, "reward": 0.6889864265918731, "reward_std": 0.3583144754171371, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.1889864146709442, "rewards/ReportKG_Jaccard/std": 0.060465405136346816, "step": 2075, "train_speed(iter/s)": 0.036849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.0, "completions/mean_length": 72.3, "completions/min_length": 54.2, "epoch": 0.4202020202020202, "grad_norm": 1.3979991674423218, "kl": 0.04376393705606461, "learning_rate": 3.297689180063277e-06, "loss": 0.03955722451210022, "memory(GiB)": 69.34, "reward": 0.8979395031929016, "reward_std": 0.3098613083362579, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.1479395031929016, "rewards/ReportKG_Jaccard/std": 0.06504066661000252, "step": 2080, "train_speed(iter/s)": 0.036864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.8, "completions/mean_length": 80.95, "completions/min_length": 61.0, "epoch": 0.4212121212121212, "grad_norm": 1.3148518800735474, "kl": 0.030339151620864868, "learning_rate": 3.2916423612948172e-06, "loss": 0.009433476626873017, "memory(GiB)": 69.34, "reward": 0.9369094640016555, "reward_std": 0.3605079486966133, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.28690946400165557, "rewards/ReportKG_Jaccard/std": 0.07022635191679001, "step": 2085, "train_speed(iter/s)": 0.036875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 75.875, "completions/min_length": 62.2, "epoch": 0.4222222222222222, "grad_norm": 1.0715919733047485, "kl": 0.04569384530186653, "learning_rate": 3.2855752193730786e-06, "loss": 0.007067376375198364, "memory(GiB)": 69.34, "reward": 0.9227349638938904, "reward_std": 0.2744250908493996, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.2977349877357483, "rewards/ReportKG_Jaccard/std": 0.09766178503632546, "step": 2090, "train_speed(iter/s)": 0.036884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 76.025, "completions/min_length": 56.4, "epoch": 0.42323232323232324, "grad_norm": 1.0571054220199585, "kl": 0.04474864602088928, "learning_rate": 3.2794878497605966e-06, "loss": 0.05119055509567261, "memory(GiB)": 69.34, "reward": 0.9218868136405944, "reward_std": 0.3037242479622364, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.22188684046268464, "rewards/ReportKG_Jaccard/std": 0.07625945433974265, "step": 2095, "train_speed(iter/s)": 0.036864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 73.425, "completions/min_length": 59.2, "epoch": 0.42424242424242425, "grad_norm": 1.5409891605377197, "kl": 0.05802946761250496, "learning_rate": 3.2733803482381765e-06, "loss": 0.0180442675948143, "memory(GiB)": 69.34, "reward": 0.7558637201786041, "reward_std": 0.4054586783051491, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.25586370229721067, "rewards/ReportKG_Jaccard/std": 0.06724533885717392, "step": 2100, "train_speed(iter/s)": 0.036877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.4, "completions/mean_length": 62.175, "completions/min_length": 47.4, "epoch": 0.42525252525252527, "grad_norm": 1.3143818378448486, "kl": 0.033476078882813454, "learning_rate": 3.2672528109033862e-06, "loss": 0.03583422303199768, "memory(GiB)": 69.34, "reward": 0.7286647140979767, "reward_std": 0.2804489374160767, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.27866473197937014, "rewards/ReportKG_Jaccard/std": 0.07483936697244645, "step": 2105, "train_speed(iter/s)": 0.036901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 77.375, "completions/min_length": 61.2, "epoch": 0.4262626262626263, "grad_norm": 1.0415540933609009, "kl": 0.03548059649765491, "learning_rate": 3.261105334169045e-06, "loss": -0.028573739528656005, "memory(GiB)": 69.34, "reward": 1.0431537985801698, "reward_std": 0.3865456014871597, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.24315379112958907, "rewards/ReportKG_Jaccard/std": 0.09387611001729965, "step": 2110, "train_speed(iter/s)": 0.036923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 68.525, "completions/min_length": 50.8, "epoch": 0.42727272727272725, "grad_norm": 1.2232565879821777, "kl": 0.03693252578377724, "learning_rate": 3.2549380147617037e-06, "loss": -0.04143472611904144, "memory(GiB)": 69.34, "reward": 0.8129944562911987, "reward_std": 0.38985239192843435, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.23799445629119872, "rewards/ReportKG_Jaccard/std": 0.04645215813070536, "step": 2115, "train_speed(iter/s)": 0.036934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.8, "completions/mean_length": 79.3, "completions/min_length": 58.8, "epoch": 0.42828282828282827, "grad_norm": 1.2665834426879883, "kl": 0.04776054471731186, "learning_rate": 3.2487509497201266e-06, "loss": 0.02826957106590271, "memory(GiB)": 69.34, "reward": 1.0163305878639222, "reward_std": 0.349671496450901, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.16633058339357376, "rewards/ReportKG_Jaccard/std": 0.05837309285998345, "step": 2120, "train_speed(iter/s)": 0.03695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.8, "completions/mean_length": 77.05, "completions/min_length": 44.8, "epoch": 0.4292929292929293, "grad_norm": 1.3609609603881836, "kl": 0.030678100138902663, "learning_rate": 3.2425442363937617e-06, "loss": -0.05464673042297363, "memory(GiB)": 69.34, "reward": 0.47679429650306704, "reward_std": 0.42580206990242003, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.12679430991411209, "rewards/ReportKG_Jaccard/std": 0.057907982170581816, "step": 2125, "train_speed(iter/s)": 0.036925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 74.275, "completions/min_length": 55.8, "epoch": 0.4303030303030303, "grad_norm": 1.0627743005752563, "kl": 0.05607433170080185, "learning_rate": 3.2363179724412104e-06, "loss": 0.0366952121257782, "memory(GiB)": 69.34, "reward": 0.9301650643348693, "reward_std": 0.4742901146411896, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4553455114364624, "rewards/ReportKG_Jaccard/mean": 0.23016507029533387, "rewards/ReportKG_Jaccard/std": 0.05082646533846855, "step": 2130, "train_speed(iter/s)": 0.036957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.6, "completions/mean_length": 81.55, "completions/min_length": 62.0, "epoch": 0.4313131313131313, "grad_norm": 1.1924933195114136, "kl": 0.02796958535909653, "learning_rate": 3.23007225582869e-06, "loss": 0.024328891932964326, "memory(GiB)": 69.34, "reward": 0.9320003271102906, "reward_std": 0.41627522110939025, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.4116185367107391, "rewards/ReportKG_Jaccard/mean": 0.1570003241300583, "rewards/ReportKG_Jaccard/std": 0.041930904239416124, "step": 2135, "train_speed(iter/s)": 0.036961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 76.9, "completions/min_length": 51.8, "epoch": 0.43232323232323233, "grad_norm": 1.8916486501693726, "kl": 0.0293037548661232, "learning_rate": 3.2238071848284933e-06, "loss": 0.04843917489051819, "memory(GiB)": 69.34, "reward": 0.45618068873882295, "reward_std": 0.2905508458614349, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.20618068873882295, "rewards/ReportKG_Jaccard/std": 0.06970761641860009, "step": 2140, "train_speed(iter/s)": 0.036975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 72.675, "completions/min_length": 53.4, "epoch": 0.43333333333333335, "grad_norm": 1.4958592653274536, "kl": 0.02921752892434597, "learning_rate": 3.217522858017441e-06, "loss": 0.0326349139213562, "memory(GiB)": 69.34, "reward": 0.8038552016019821, "reward_std": 0.3039001628756523, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.22885520458221437, "rewards/ReportKG_Jaccard/std": 0.11592551246285439, "step": 2145, "train_speed(iter/s)": 0.036985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 68.025, "completions/min_length": 53.6, "epoch": 0.43434343434343436, "grad_norm": 1.5133602619171143, "kl": 0.045952131226658824, "learning_rate": 3.211219374275333e-06, "loss": 0.05630273222923279, "memory(GiB)": 69.34, "reward": 0.7202868819236755, "reward_std": 0.42282580882310866, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.22028686702251435, "rewards/ReportKG_Jaccard/std": 0.07258662059903145, "step": 2150, "train_speed(iter/s)": 0.037008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.6, "completions/mean_length": 66.275, "completions/min_length": 48.4, "epoch": 0.4353535353535353, "grad_norm": 1.023567795753479, "kl": 0.047566935420036316, "learning_rate": 3.2048968327833902e-06, "loss": 0.06465665698051452, "memory(GiB)": 69.34, "reward": 0.8718412399291993, "reward_std": 0.27750842571258544, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.29684124439954757, "rewards/ReportKG_Jaccard/std": 0.07494108229875565, "step": 2155, "train_speed(iter/s)": 0.037043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.0, "completions/mean_length": 61.925, "completions/min_length": 47.6, "epoch": 0.43636363636363634, "grad_norm": 1.028537392616272, "kl": 0.03826615791767836, "learning_rate": 3.1985553330226935e-06, "loss": 0.0023768961429595946, "memory(GiB)": 69.34, "reward": 0.5335816502571106, "reward_std": 0.35607210621237756, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.20858165621757507, "rewards/ReportKG_Jaccard/std": 0.061163203418254854, "step": 2160, "train_speed(iter/s)": 0.037064 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.4, "completions/mean_length": 69.5, "completions/min_length": 54.2, "epoch": 0.43737373737373736, "grad_norm": 1.9504408836364746, "kl": 0.02473407853394747, "learning_rate": 3.1921949747726227e-06, "loss": 0.013884896039962768, "memory(GiB)": 69.34, "reward": 0.650429117679596, "reward_std": 0.41414561569690705, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2504291146993637, "rewards/ReportKG_Jaccard/std": 0.0776828821748495, "step": 2165, "train_speed(iter/s)": 0.037063 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 70.225, "completions/min_length": 49.2, "epoch": 0.4383838383838384, "grad_norm": 1.6320337057113647, "kl": 0.0429587546736002, "learning_rate": 3.185815858109281e-06, "loss": 0.02475445866584778, "memory(GiB)": 69.34, "reward": 0.9224824666976928, "reward_std": 0.34529740214347837, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.12248246222734452, "rewards/ReportKG_Jaccard/std": 0.04613770432770252, "step": 2170, "train_speed(iter/s)": 0.03708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 67.425, "completions/min_length": 50.6, "epoch": 0.4393939393939394, "grad_norm": 1.6025909185409546, "kl": 0.042652325704693794, "learning_rate": 3.1794180834039242e-06, "loss": 0.019135190546512602, "memory(GiB)": 69.34, "reward": 0.8941025972366333, "reward_std": 0.43028574585914614, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3921836853027344, "rewards/ReportKG_Jaccard/mean": 0.24410259425640107, "rewards/ReportKG_Jaccard/std": 0.08318693451583385, "step": 2175, "train_speed(iter/s)": 0.0371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 72.3, "completions/min_length": 49.2, "epoch": 0.4404040404040404, "grad_norm": 1.5459145307540894, "kl": 0.029407751560211182, "learning_rate": 3.1730017513213804e-06, "loss": 0.0369245707988739, "memory(GiB)": 69.34, "reward": 0.6054251998662948, "reward_std": 0.3888263776898384, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.23042519986629487, "rewards/ReportKG_Jaccard/std": 0.061946692317724227, "step": 2180, "train_speed(iter/s)": 0.037114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 70.2, "completions/min_length": 54.0, "epoch": 0.4414141414141414, "grad_norm": 1.0999666452407837, "kl": 0.03971602469682693, "learning_rate": 3.166566962818466e-06, "loss": 0.0067515261471271515, "memory(GiB)": 69.34, "reward": 0.7850148320198059, "reward_std": 0.5316619634628296, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.5024828433990478, "rewards/ReportKG_Jaccard/mean": 0.21001481711864473, "rewards/ReportKG_Jaccard/std": 0.052336320839822294, "step": 2185, "train_speed(iter/s)": 0.037133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.8, "completions/mean_length": 77.825, "completions/min_length": 57.4, "epoch": 0.44242424242424244, "grad_norm": 1.3605865240097046, "kl": 0.04849878810346127, "learning_rate": 3.1601138191423963e-06, "loss": -0.012880191206932068, "memory(GiB)": 69.34, "reward": 0.6027083039283753, "reward_std": 0.4986249566078186, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.4881446659564972, "rewards/ReportKG_Jaccard/mean": 0.15270828753709792, "rewards/ReportKG_Jaccard/std": 0.03173781558871269, "step": 2190, "train_speed(iter/s)": 0.037157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.6, "completions/mean_length": 76.65, "completions/min_length": 62.0, "epoch": 0.44343434343434346, "grad_norm": 1.4935790300369263, "kl": 0.04789825975894928, "learning_rate": 3.1536424218291947e-06, "loss": 0.020464815199375153, "memory(GiB)": 69.34, "reward": 0.4849762976169586, "reward_std": 0.3626245245337486, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2099762998521328, "rewards/ReportKG_Jaccard/std": 0.08064576387405395, "step": 2195, "train_speed(iter/s)": 0.037163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.0, "completions/mean_length": 82.375, "completions/min_length": 65.6, "epoch": 0.4444444444444444, "grad_norm": 1.1533235311508179, "kl": 0.03891930393874645, "learning_rate": 3.147152872702092e-06, "loss": 0.037910208106040955, "memory(GiB)": 69.34, "reward": 1.110875678062439, "reward_std": 0.3177148371934891, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.26087566614151003, "rewards/ReportKG_Jaccard/std": 0.0698818914592266, "step": 2200, "train_speed(iter/s)": 0.037168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 70.325, "completions/min_length": 51.6, "epoch": 0.44545454545454544, "grad_norm": 1.238495111465454, "kl": 0.03330329619348049, "learning_rate": 3.140645273869928e-06, "loss": 0.021097236871719362, "memory(GiB)": 69.34, "reward": 1.015767228603363, "reward_std": 0.32883541882038114, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.21576724648475648, "rewards/ReportKG_Jaccard/std": 0.11617882549762726, "step": 2205, "train_speed(iter/s)": 0.037171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 72.075, "completions/min_length": 55.8, "epoch": 0.44646464646464645, "grad_norm": 1.2123228311538696, "kl": 0.030745989829301833, "learning_rate": 3.134119727725541e-06, "loss": 0.028390339016914366, "memory(GiB)": 69.34, "reward": 0.7363202571868896, "reward_std": 0.3356855735182762, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.18632026314735411, "rewards/ReportKG_Jaccard/std": 0.05471786819398403, "step": 2210, "train_speed(iter/s)": 0.037155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 113.4, "completions/mean_length": 85.625, "completions/min_length": 66.4, "epoch": 0.44747474747474747, "grad_norm": 1.3407719135284424, "kl": 0.037312401458621025, "learning_rate": 3.1275763369441606e-06, "loss": 0.02495434880256653, "memory(GiB)": 69.34, "reward": 0.9548536479473114, "reward_std": 0.2815326064825058, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2548536777496338, "rewards/ReportKG_Jaccard/std": 0.06731217056512832, "step": 2215, "train_speed(iter/s)": 0.037135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.8, "completions/mean_length": 80.725, "completions/min_length": 63.0, "epoch": 0.4484848484848485, "grad_norm": 0.9985986948013306, "kl": 0.03997566159814596, "learning_rate": 3.1210152044817876e-06, "loss": -0.016781315207481384, "memory(GiB)": 69.34, "reward": 1.0077873945236206, "reward_std": 0.3501614287495613, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.28278741538524627, "rewards/ReportKG_Jaccard/std": 0.09174927100539207, "step": 2220, "train_speed(iter/s)": 0.037137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.4, "completions/mean_length": 83.175, "completions/min_length": 63.4, "epoch": 0.4494949494949495, "grad_norm": 1.451905369758606, "kl": 0.03784034885466099, "learning_rate": 3.1144364335735795e-06, "loss": 0.017689049243927002, "memory(GiB)": 69.34, "reward": 0.812034261226654, "reward_std": 0.5242096841335296, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.5066213369369507, "rewards/ReportKG_Jaccard/mean": 0.262034273147583, "rewards/ReportKG_Jaccard/std": 0.0752962626516819, "step": 2225, "train_speed(iter/s)": 0.037136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 69.15, "completions/min_length": 55.0, "epoch": 0.4505050505050505, "grad_norm": 1.0547778606414795, "kl": 0.052710967138409615, "learning_rate": 3.1078401277322207e-06, "loss": -0.007481781393289566, "memory(GiB)": 69.34, "reward": 0.8166802525520325, "reward_std": 0.36084731072187426, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.21668027639389037, "rewards/ReportKG_Jaccard/std": 0.04796922393143177, "step": 2230, "train_speed(iter/s)": 0.03716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 70.05, "completions/min_length": 58.2, "epoch": 0.45151515151515154, "grad_norm": 1.4338555335998535, "kl": 0.033339330554008485, "learning_rate": 3.1012263907462973e-06, "loss": -0.0034129742532968523, "memory(GiB)": 69.34, "reward": 1.0034459233283997, "reward_std": 0.2934399783611298, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.22844592928886415, "rewards/ReportKG_Jaccard/std": 0.05365284234285354, "step": 2235, "train_speed(iter/s)": 0.037163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 83.225, "completions/min_length": 64.8, "epoch": 0.45252525252525255, "grad_norm": 1.1836220026016235, "kl": 0.046014399453997615, "learning_rate": 3.094595326678665e-06, "loss": 0.02475656569004059, "memory(GiB)": 69.34, "reward": 0.7711865544319153, "reward_std": 0.3748429283499718, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2211865544319153, "rewards/ReportKG_Jaccard/std": 0.08185187503695487, "step": 2240, "train_speed(iter/s)": 0.03716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.6, "completions/mean_length": 65.65, "completions/min_length": 49.8, "epoch": 0.4535353535353535, "grad_norm": 1.6305748224258423, "kl": 0.03773084618151188, "learning_rate": 3.087947039864806e-06, "loss": 0.0876004457473755, "memory(GiB)": 69.34, "reward": 1.111200761795044, "reward_std": 0.17372752353549004, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.2862007707357407, "rewards/ReportKG_Jaccard/std": 0.1143163226544857, "step": 2245, "train_speed(iter/s)": 0.037189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.2, "completions/mean_length": 76.375, "completions/min_length": 55.6, "epoch": 0.45454545454545453, "grad_norm": 1.7779560089111328, "kl": 0.042601145803928375, "learning_rate": 3.0812816349111954e-06, "loss": -0.0506343424320221, "memory(GiB)": 69.34, "reward": 0.996980893611908, "reward_std": 0.20613915771245955, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.19698088765144348, "rewards/ReportKG_Jaccard/std": 0.07082885652780532, "step": 2250, "train_speed(iter/s)": 0.037168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 69.775, "completions/min_length": 46.6, "epoch": 0.45555555555555555, "grad_norm": 1.5738493204116821, "kl": 0.0551819808781147, "learning_rate": 3.074599216693648e-06, "loss": -0.04660596549510956, "memory(GiB)": 69.34, "reward": 1.0344456553459167, "reward_std": 0.45724923610687257, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.23444568514823913, "rewards/ReportKG_Jaccard/std": 0.09621574357151985, "step": 2255, "train_speed(iter/s)": 0.037177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 74.525, "completions/min_length": 59.0, "epoch": 0.45656565656565656, "grad_norm": 1.2720264196395874, "kl": 0.024331217631697655, "learning_rate": 3.0678998903556722e-06, "loss": 0.06501330137252807, "memory(GiB)": 69.34, "reward": 0.9582816779613494, "reward_std": 0.3350313052535057, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2832816869020462, "rewards/ReportKG_Jaccard/std": 0.08735342025756836, "step": 2260, "train_speed(iter/s)": 0.037204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.4, "completions/mean_length": 69.975, "completions/min_length": 56.6, "epoch": 0.4575757575757576, "grad_norm": 1.0464247465133667, "kl": 0.035893242061138156, "learning_rate": 3.061183761306816e-06, "loss": 0.060890835523605344, "memory(GiB)": 69.34, "reward": 1.092944896221161, "reward_std": 0.281975332647562, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.2429449141025543, "rewards/ReportKG_Jaccard/std": 0.04704473167657852, "step": 2265, "train_speed(iter/s)": 0.037223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 73.25, "completions/min_length": 59.8, "epoch": 0.4585858585858586, "grad_norm": 1.5744943618774414, "kl": 0.060677670314908026, "learning_rate": 3.0544509352210046e-06, "loss": 0.07411064505577088, "memory(GiB)": 69.34, "reward": 1.1417810916900635, "reward_std": 0.35015862584114077, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.2917810767889023, "rewards/ReportKG_Jaccard/std": 0.0635044950991869, "step": 2270, "train_speed(iter/s)": 0.037227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 83.225, "completions/min_length": 61.6, "epoch": 0.4595959595959596, "grad_norm": 1.3180164098739624, "kl": 0.026389727368950844, "learning_rate": 3.047701518034883e-06, "loss": -0.05485573410987854, "memory(GiB)": 69.34, "reward": 0.8879529476165772, "reward_std": 0.27001051157712935, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.16295297741889953, "rewards/ReportKG_Jaccard/std": 0.0547323614358902, "step": 2275, "train_speed(iter/s)": 0.037192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.4, "completions/mean_length": 89.925, "completions/min_length": 69.4, "epoch": 0.46060606060606063, "grad_norm": 1.1884781122207642, "kl": 0.04439656250178814, "learning_rate": 3.0409356159461444e-06, "loss": -0.015963684022426605, "memory(GiB)": 69.34, "reward": 0.947792899608612, "reward_std": 0.34613641649484633, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.12279290184378625, "rewards/ReportKG_Jaccard/std": 0.03858703896403313, "step": 2280, "train_speed(iter/s)": 0.037176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 78.4, "completions/min_length": 58.8, "epoch": 0.4616161616161616, "grad_norm": 1.3079936504364014, "kl": 0.03467754274606705, "learning_rate": 3.0341533354118616e-06, "loss": 0.028581866621971132, "memory(GiB)": 69.34, "reward": 0.6641410499811172, "reward_std": 0.24536407738924026, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.1851640224456787, "rewards/ReportKG_Jaccard/mean": 0.16414104253053666, "rewards/ReportKG_Jaccard/std": 0.0734124943614006, "step": 2285, "train_speed(iter/s)": 0.037187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 73.875, "completions/min_length": 56.6, "epoch": 0.4626262626262626, "grad_norm": 1.3288899660110474, "kl": 0.02563426773995161, "learning_rate": 3.027354783146813e-06, "loss": -0.014439022541046143, "memory(GiB)": 69.34, "reward": 0.570756733417511, "reward_std": 0.35341678708791735, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.19575673043727876, "rewards/ReportKG_Jaccard/std": 0.06824287176132202, "step": 2290, "train_speed(iter/s)": 0.03717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 70.65, "completions/min_length": 53.0, "epoch": 0.4636363636363636, "grad_norm": 1.3458964824676514, "kl": 0.034535855427384375, "learning_rate": 3.0205400661217995e-06, "loss": 0.09424914717674256, "memory(GiB)": 69.34, "reward": 0.8984806180000305, "reward_std": 0.2046557568013668, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.22348060011863707, "rewards/ReportKG_Jaccard/std": 0.0537574477493763, "step": 2295, "train_speed(iter/s)": 0.037187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 85.85, "completions/min_length": 64.8, "epoch": 0.46464646464646464, "grad_norm": 1.2050834894180298, "kl": 0.03932952545583248, "learning_rate": 3.013709291561966e-06, "loss": 0.034259301424026486, "memory(GiB)": 69.34, "reward": 0.8429034978151322, "reward_std": 0.4058990508317947, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.21790347769856452, "rewards/ReportKG_Jaccard/std": 0.07794686928391456, "step": 2300, "train_speed(iter/s)": 0.037187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 75.8, "completions/mean_length": 64.225, "completions/min_length": 53.0, "epoch": 0.46565656565656566, "grad_norm": 2.065251588821411, "kl": 0.029086319915950298, "learning_rate": 3.0068625669451116e-06, "loss": 0.01173483282327652, "memory(GiB)": 69.34, "reward": 1.0894884049892426, "reward_std": 0.22136652320623398, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.289488410949707, "rewards/ReportKG_Jaccard/std": 0.10426219552755356, "step": 2305, "train_speed(iter/s)": 0.03718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.8, "completions/mean_length": 81.1, "completions/min_length": 60.0, "epoch": 0.4666666666666667, "grad_norm": 1.113377571105957, "kl": 0.05040336884558201, "learning_rate": 3e-06, "loss": 0.010916664451360702, "memory(GiB)": 69.34, "reward": 0.9726512670516968, "reward_std": 0.3026274651288986, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.17265123426914214, "rewards/ReportKG_Jaccard/std": 0.06892393380403519, "step": 2310, "train_speed(iter/s)": 0.037193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 67.725, "completions/min_length": 45.0, "epoch": 0.4676767676767677, "grad_norm": 1.270076036453247, "kl": 0.038900750875473025, "learning_rate": 2.9931216987046623e-06, "loss": 0.042980581521987915, "memory(GiB)": 69.34, "reward": 0.808109450340271, "reward_std": 0.4832286357879639, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.45536120533943175, "rewards/ReportKG_Jaccard/mean": 0.1831094354391098, "rewards/ReportKG_Jaccard/std": 0.06036430709064007, "step": 2315, "train_speed(iter/s)": 0.03721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 71.775, "completions/min_length": 56.8, "epoch": 0.4686868686868687, "grad_norm": 1.384459376335144, "kl": 0.034354706108570096, "learning_rate": 2.9862277712847004e-06, "loss": 0.017540933191776277, "memory(GiB)": 69.34, "reward": 0.8878884434700012, "reward_std": 0.40615976303815843, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.2878884136676788, "rewards/ReportKG_Jaccard/std": 0.1332600511610508, "step": 2320, "train_speed(iter/s)": 0.03723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.4, "completions/mean_length": 76.65, "completions/min_length": 49.4, "epoch": 0.4696969696969697, "grad_norm": 1.297957420349121, "kl": 0.03663063123822212, "learning_rate": 2.9793183262115823e-06, "loss": -0.014770416915416718, "memory(GiB)": 69.34, "reward": 0.7560045301914216, "reward_std": 0.40976249277591703, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.20600451063364744, "rewards/ReportKG_Jaccard/std": 0.052338804304599765, "step": 2325, "train_speed(iter/s)": 0.037239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.4, "completions/mean_length": 55.425, "completions/min_length": 42.0, "epoch": 0.4707070707070707, "grad_norm": 1.1405115127563477, "kl": 0.05336366631090641, "learning_rate": 2.972393472200937e-06, "loss": 0.048301669955253604, "memory(GiB)": 69.34, "reward": 0.6687825620174408, "reward_std": 0.43480961918830874, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.1687825709581375, "rewards/ReportKG_Jaccard/std": 0.0573244409635663, "step": 2330, "train_speed(iter/s)": 0.037258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 68.175, "completions/min_length": 55.0, "epoch": 0.4717171717171717, "grad_norm": 1.4693628549575806, "kl": 0.024273376166820525, "learning_rate": 2.9654533182108435e-06, "loss": 0.07056596279144287, "memory(GiB)": 69.34, "reward": 1.1530682444572449, "reward_std": 0.4222117722034454, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.35306823551654815, "rewards/ReportKG_Jaccard/std": 0.0963135004043579, "step": 2335, "train_speed(iter/s)": 0.03728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 75.6, "completions/mean_length": 59.8, "completions/min_length": 46.4, "epoch": 0.4727272727272727, "grad_norm": 1.6533609628677368, "kl": 0.0329777292907238, "learning_rate": 2.9584979734401135e-06, "loss": 0.0348210334777832, "memory(GiB)": 69.34, "reward": 0.8020343661308289, "reward_std": 0.35569686591625216, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.12703437507152557, "rewards/ReportKG_Jaccard/std": 0.06510804072022439, "step": 2340, "train_speed(iter/s)": 0.037283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.8, "completions/mean_length": 65.375, "completions/min_length": 50.8, "epoch": 0.47373737373737373, "grad_norm": 1.3100700378417969, "kl": 0.0440786425024271, "learning_rate": 2.951527547326579e-06, "loss": 0.036370846629142764, "memory(GiB)": 69.34, "reward": 0.7640257984399795, "reward_std": 0.22044760063290597, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.2640257656574249, "rewards/ReportKG_Jaccard/std": 0.060833952575922015, "step": 2345, "train_speed(iter/s)": 0.037274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 67.45, "completions/min_length": 45.4, "epoch": 0.47474747474747475, "grad_norm": 1.2444279193878174, "kl": 0.03275593630969524, "learning_rate": 2.9445421495453654e-06, "loss": 0.03372513353824615, "memory(GiB)": 69.34, "reward": 0.9467801094055176, "reward_std": 0.29354586750268935, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.22178008854389192, "rewards/ReportKG_Jaccard/std": 0.06593890190124511, "step": 2350, "train_speed(iter/s)": 0.037244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 85.95, "completions/min_length": 67.8, "epoch": 0.47575757575757577, "grad_norm": 1.3085757493972778, "kl": 0.04557507336139679, "learning_rate": 2.9375418900071675e-06, "loss": 0.009865665435791015, "memory(GiB)": 69.34, "reward": 0.8035372018814086, "reward_std": 0.4457257747650146, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.17853719592094422, "rewards/ReportKG_Jaccard/std": 0.04206672087311745, "step": 2355, "train_speed(iter/s)": 0.037238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.4, "completions/mean_length": 66.525, "completions/min_length": 52.6, "epoch": 0.4767676767676768, "grad_norm": 0.7837011218070984, "kl": 0.07298685684800148, "learning_rate": 2.930526878856521e-06, "loss": 0.10924978256225586, "memory(GiB)": 69.34, "reward": 1.0484777688980103, "reward_std": 0.35158455073833467, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.32347774803638457, "rewards/ReportKG_Jaccard/std": 0.09235744625329971, "step": 2360, "train_speed(iter/s)": 0.03723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 62.625, "completions/min_length": 50.8, "epoch": 0.4777777777777778, "grad_norm": 1.500807762145996, "kl": 0.029808445647358895, "learning_rate": 2.923497226470068e-06, "loss": 0.006714872270822525, "memory(GiB)": 69.34, "reward": 0.7993951559066772, "reward_std": 0.3933232605457306, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.29939515590667726, "rewards/ReportKG_Jaccard/std": 0.06485679224133492, "step": 2365, "train_speed(iter/s)": 0.037257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 68.75, "completions/min_length": 51.0, "epoch": 0.47878787878787876, "grad_norm": 1.0913920402526855, "kl": 0.04608013294637203, "learning_rate": 2.9164530434548207e-06, "loss": 0.006052520126104355, "memory(GiB)": 69.34, "reward": 0.6715768039226532, "reward_std": 0.44566301703453065, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.1965768039226532, "rewards/ReportKG_Jaccard/std": 0.06261527314782142, "step": 2370, "train_speed(iter/s)": 0.037256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 80.425, "completions/min_length": 58.4, "epoch": 0.4797979797979798, "grad_norm": 0.8766714334487915, "kl": 0.034894535318017006, "learning_rate": 2.9093944406464223e-06, "loss": 0.013027581572532653, "memory(GiB)": 69.34, "reward": 0.8044621467590332, "reward_std": 0.44770256280899046, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.20446214973926544, "rewards/ReportKG_Jaccard/std": 0.054790768027305606, "step": 2375, "train_speed(iter/s)": 0.037266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.4, "completions/mean_length": 76.25, "completions/min_length": 57.2, "epoch": 0.4808080808080808, "grad_norm": 1.206784963607788, "kl": 0.048453055694699286, "learning_rate": 2.9023215291074014e-06, "loss": 0.025715354084968566, "memory(GiB)": 69.34, "reward": 1.2062895536422729, "reward_std": 0.27452014535665514, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2812895327806473, "rewards/ReportKG_Jaccard/std": 0.0915838599205017, "step": 2380, "train_speed(iter/s)": 0.037278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 71.2, "completions/min_length": 53.4, "epoch": 0.4818181818181818, "grad_norm": 1.1780097484588623, "kl": 0.06122711189091205, "learning_rate": 2.895234420125425e-06, "loss": 0.019644108414649964, "memory(GiB)": 69.34, "reward": 0.9137413263320923, "reward_std": 0.4897688925266266, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.23874134719371795, "rewards/ReportKG_Jaccard/std": 0.07168719209730626, "step": 2385, "train_speed(iter/s)": 0.037285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.6, "completions/mean_length": 81.3, "completions/min_length": 66.6, "epoch": 0.48282828282828283, "grad_norm": 1.2903093099594116, "kl": 0.04331960901618004, "learning_rate": 2.8881332252115482e-06, "loss": 0.04721409380435944, "memory(GiB)": 69.34, "reward": 0.7116395592689514, "reward_std": 0.339777746796608, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.16163956820964814, "rewards/ReportKG_Jaccard/std": 0.04977225661277771, "step": 2390, "train_speed(iter/s)": 0.037293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.0, "completions/mean_length": 90.475, "completions/min_length": 67.4, "epoch": 0.48383838383838385, "grad_norm": 0.9303982257843018, "kl": 0.05075716115534305, "learning_rate": 2.8810180560984586e-06, "loss": 0.024666284024715424, "memory(GiB)": 69.34, "reward": 0.9453327775001525, "reward_std": 0.341342905163765, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.19533276706933975, "rewards/ReportKG_Jaccard/std": 0.07435529157519341, "step": 2395, "train_speed(iter/s)": 0.037297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.0, "completions/mean_length": 91.125, "completions/min_length": 70.0, "epoch": 0.48484848484848486, "grad_norm": 1.0350823402404785, "kl": 0.03972895406186581, "learning_rate": 2.8738890247387187e-06, "loss": 0.008512541651725769, "memory(GiB)": 69.34, "reward": 0.8141757100820541, "reward_std": 0.27358385771512983, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.21417570114135742, "rewards/ReportKG_Jaccard/std": 0.05249550640583038, "step": 2400, "train_speed(iter/s)": 0.037308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/mean_length": 72.3, "completions/min_length": 57.8, "epoch": 0.4858585858585859, "grad_norm": 1.6259504556655884, "kl": 0.035232774913311005, "learning_rate": 2.8667462433030054e-06, "loss": -0.030330970883369446, "memory(GiB)": 69.34, "reward": 0.910465270280838, "reward_std": 0.30052012503147124, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.2604652941226959, "rewards/ReportKG_Jaccard/std": 0.08087525144219398, "step": 2405, "train_speed(iter/s)": 0.037323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 119.0, "completions/mean_length": 85.3, "completions/min_length": 57.4, "epoch": 0.4868686868686869, "grad_norm": 1.3060835599899292, "kl": 0.025691529177129267, "learning_rate": 2.8595898241783433e-06, "loss": -0.05115538835525513, "memory(GiB)": 69.34, "reward": 0.6741631597280502, "reward_std": 0.3512692615389824, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.31731882095336916, "rewards/ReportKG_Jaccard/mean": 0.19916318058967591, "rewards/ReportKG_Jaccard/std": 0.05904264822602272, "step": 2410, "train_speed(iter/s)": 0.037316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.2, "completions/mean_length": 71.325, "completions/min_length": 56.6, "epoch": 0.48787878787878786, "grad_norm": 1.3106170892715454, "kl": 0.039570364728569986, "learning_rate": 2.8524198799663365e-06, "loss": 0.043952721357345584, "memory(GiB)": 69.34, "reward": 1.0602641582489014, "reward_std": 0.3640130177140236, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.26026414930820463, "rewards/ReportKG_Jaccard/std": 0.08826415091753007, "step": 2415, "train_speed(iter/s)": 0.037314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 67.125, "completions/min_length": 52.8, "epoch": 0.4888888888888889, "grad_norm": 1.2695114612579346, "kl": 0.061475294455885884, "learning_rate": 2.845236523481399e-06, "loss": 0.04399910867214203, "memory(GiB)": 69.34, "reward": 0.574722820520401, "reward_std": 0.23598473891615868, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.22472280859947205, "rewards/ReportKG_Jaccard/std": 0.07426261156797409, "step": 2420, "train_speed(iter/s)": 0.037326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.6, "completions/mean_length": 77.525, "completions/min_length": 60.4, "epoch": 0.4898989898989899, "grad_norm": 1.1337774991989136, "kl": 0.03641211166977883, "learning_rate": 2.838039867748977e-06, "loss": -0.005209358409047127, "memory(GiB)": 69.34, "reward": 0.985934054851532, "reward_std": 0.29519990012049674, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2109340488910675, "rewards/ReportKG_Jaccard/std": 0.07257962450385094, "step": 2425, "train_speed(iter/s)": 0.037304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/mean_length": 71.025, "completions/min_length": 59.6, "epoch": 0.4909090909090909, "grad_norm": 0.6869730949401855, "kl": 0.08444352336227894, "learning_rate": 2.830830026003773e-06, "loss": 0.04394524693489075, "memory(GiB)": 69.34, "reward": 0.7933101415634155, "reward_std": 0.36150012612342836, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.16831013709306716, "rewards/ReportKG_Jaccard/std": 0.044504132121801376, "step": 2430, "train_speed(iter/s)": 0.037322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 71.225, "completions/min_length": 55.0, "epoch": 0.4919191919191919, "grad_norm": 1.4766589403152466, "kl": 0.02936662957072258, "learning_rate": 2.8236071116879614e-06, "loss": 0.06509698629379272, "memory(GiB)": 69.34, "reward": 0.687681233882904, "reward_std": 0.3165800258517265, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.2626812160015106, "rewards/ReportKG_Jaccard/std": 0.06556159481406212, "step": 2435, "train_speed(iter/s)": 0.037342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 118.0, "completions/mean_length": 83.575, "completions/min_length": 59.0, "epoch": 0.49292929292929294, "grad_norm": 1.369354009628296, "kl": 0.026151398196816444, "learning_rate": 2.816371238449406e-06, "loss": -0.03990768790245056, "memory(GiB)": 69.34, "reward": 0.5652624785900116, "reward_std": 0.48608211874961854, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.4478123545646667, "rewards/ReportKG_Jaccard/mean": 0.16526248604059218, "rewards/ReportKG_Jaccard/std": 0.05380699299275875, "step": 2440, "train_speed(iter/s)": 0.037343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 68.475, "completions/min_length": 55.2, "epoch": 0.49393939393939396, "grad_norm": 1.1696940660476685, "kl": 0.04118635281920433, "learning_rate": 2.80912252013987e-06, "loss": -0.02992696166038513, "memory(GiB)": 69.34, "reward": 1.058256733417511, "reward_std": 0.3274064928293228, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.23325674831867219, "rewards/ReportKG_Jaccard/std": 0.06777201183140277, "step": 2445, "train_speed(iter/s)": 0.037347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 68.55, "completions/min_length": 55.6, "epoch": 0.494949494949495, "grad_norm": 3.0020570755004883, "kl": 0.04440616145730018, "learning_rate": 2.8018610708132273e-06, "loss": 0.03858020007610321, "memory(GiB)": 69.34, "reward": 0.4596382945775986, "reward_std": 0.4486883819103241, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.2096382886171341, "rewards/ReportKG_Jaccard/std": 0.044077922403812406, "step": 2450, "train_speed(iter/s)": 0.037372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 121.4, "completions/mean_length": 91.7, "completions/min_length": 60.0, "epoch": 0.49595959595959593, "grad_norm": 1.283844232559204, "kl": 0.024764401838183403, "learning_rate": 2.7945870047236636e-06, "loss": 0.005531685054302215, "memory(GiB)": 69.34, "reward": 0.7111502170562745, "reward_std": 0.38739774078130723, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.2111502021551132, "rewards/ReportKG_Jaccard/std": 0.05321626216173172, "step": 2455, "train_speed(iter/s)": 0.037343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.8, "completions/mean_length": 85.875, "completions/min_length": 68.2, "epoch": 0.49696969696969695, "grad_norm": 1.2916942834854126, "kl": 0.025029375031590462, "learning_rate": 2.787300436323883e-06, "loss": -0.03969949781894684, "memory(GiB)": 69.34, "reward": 0.41592428386211394, "reward_std": 0.24017678946256638, "rewards/MultiModalAccuracyORM_Any/mean": 0.225, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.1909242793917656, "rewards/ReportKG_Jaccard/std": 0.04643104746937752, "step": 2460, "train_speed(iter/s)": 0.03734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.4, "completions/mean_length": 69.6, "completions/min_length": 57.6, "epoch": 0.49797979797979797, "grad_norm": 1.2863420248031616, "kl": 0.042490940541028976, "learning_rate": 2.7800014802633035e-06, "loss": 0.007593570649623871, "memory(GiB)": 69.34, "reward": 0.8914391040802002, "reward_std": 0.5037807762622833, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.2664391100406647, "rewards/ReportKG_Jaccard/std": 0.0527681540697813, "step": 2465, "train_speed(iter/s)": 0.037368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 74.75, "completions/min_length": 54.2, "epoch": 0.498989898989899, "grad_norm": 1.8226549625396729, "kl": 0.029930275678634644, "learning_rate": 2.772690251386257e-06, "loss": 0.03462924361228943, "memory(GiB)": 69.34, "reward": 0.7705440193414688, "reward_std": 0.3064823687076569, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.22054402977228166, "rewards/ReportKG_Jaccard/std": 0.087401083111763, "step": 2470, "train_speed(iter/s)": 0.03737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.8, "completions/mean_length": 78.85, "completions/min_length": 59.2, "epoch": 0.5, "grad_norm": 1.888170599937439, "kl": 0.026276568695902825, "learning_rate": 2.7653668647301796e-06, "loss": -0.06396101117134094, "memory(GiB)": 69.34, "reward": 0.6586519241333008, "reward_std": 0.38563349172472955, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.23365192115306854, "rewards/ReportKG_Jaccard/std": 0.05985263437032699, "step": 2475, "train_speed(iter/s)": 0.037385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 117.2, "completions/mean_length": 84.875, "completions/min_length": 64.4, "epoch": 0.501010101010101, "grad_norm": 1.8143123388290405, "kl": 0.035914698243141176, "learning_rate": 2.758031435523801e-06, "loss": -0.015312404930591583, "memory(GiB)": 69.34, "reward": 0.9269901156425476, "reward_std": 0.46262293457984927, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.20199012011289597, "rewards/ReportKG_Jaccard/std": 0.05942768268287182, "step": 2480, "train_speed(iter/s)": 0.037369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.2, "completions/mean_length": 88.2, "completions/min_length": 65.0, "epoch": 0.502020202020202, "grad_norm": 1.383043885231018, "kl": 0.04872849658131599, "learning_rate": 2.7506840791853335e-06, "loss": 0.05613468885421753, "memory(GiB)": 69.34, "reward": 0.7686197757720947, "reward_std": 0.40479432344436644, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4116185367107391, "rewards/ReportKG_Jaccard/mean": 0.1436197653412819, "rewards/ReportKG_Jaccard/std": 0.05502827502787113, "step": 2485, "train_speed(iter/s)": 0.037356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 81.6, "completions/min_length": 59.0, "epoch": 0.503030303030303, "grad_norm": 1.260719895362854, "kl": 0.024857469648122788, "learning_rate": 2.743324911320655e-06, "loss": 0.018419402837753295, "memory(GiB)": 69.34, "reward": 0.9319514155387878, "reward_std": 0.18601444512605667, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.2819514125585556, "rewards/ReportKG_Jaccard/std": 0.108526611328125, "step": 2490, "train_speed(iter/s)": 0.037352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 74.475, "completions/min_length": 59.0, "epoch": 0.5040404040404041, "grad_norm": 1.547063946723938, "kl": 0.03837917037308216, "learning_rate": 2.7359540477214904e-06, "loss": 0.07167366147041321, "memory(GiB)": 69.34, "reward": 0.9145090699195861, "reward_std": 0.5179825484752655, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.26450908184051514, "rewards/ReportKG_Jaccard/std": 0.07540268152952194, "step": 2495, "train_speed(iter/s)": 0.037369 }, { "epoch": 0.5050505050505051, "grad_norm": 1.142601490020752, "learning_rate": 2.7285716043635884e-06, "loss": 0.0262813538312912, "memory(GiB)": 69.34, "step": 2500, "train_speed(iter/s)": 0.037365 }, { "epoch": 0.5050505050505051, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 104.66, "eval_completions/mean_length": 78.4475, "eval_completions/min_length": 58.32, "eval_kl": 0.03638719728216529, "eval_loss": 0.007472493220120668, "eval_reward": 0.7370552970468998, "eval_reward_std": 0.31265824489295485, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.53, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.2733822160959244, "eval_rewards/ReportKG_Jaccard/mean": 0.20705530144274234, "eval_rewards/ReportKG_Jaccard/std": 0.060105125047266485, "eval_runtime": 942.3001, "eval_samples_per_second": 0.053, "eval_steps_per_second": 0.007, "step": 2500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.2, "completions/mean_length": 85.0, "completions/min_length": 64.3, "epoch": 0.5060606060606061, "grad_norm": 1.1613320112228394, "kl": 0.046988018229603766, "learning_rate": 2.7211776974048993e-06, "loss": 0.03674744963645935, "memory(GiB)": 69.34, "reward": 0.8512899294495583, "reward_std": 0.34565289244055747, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3038551896810532, "rewards/ReportKG_Jaccard/mean": 0.22628992199897766, "rewards/ReportKG_Jaccard/std": 0.07900865394622088, "step": 2505, "train_speed(iter/s)": 0.036798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.4, "completions/mean_length": 75.525, "completions/min_length": 62.8, "epoch": 0.5070707070707071, "grad_norm": 1.0622332096099854, "kl": 0.039182035624980925, "learning_rate": 2.7137724431837433e-06, "loss": 0.03315341174602508, "memory(GiB)": 69.34, "reward": 0.693426051735878, "reward_std": 0.40754408240318296, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.24342605471611023, "rewards/ReportKG_Jaccard/std": 0.08092931434512138, "step": 2510, "train_speed(iter/s)": 0.036803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 65.675, "completions/min_length": 52.2, "epoch": 0.5080808080808081, "grad_norm": 1.2608764171600342, "kl": 0.03128378093242645, "learning_rate": 2.7063559582169854e-06, "loss": 0.022367137670516967, "memory(GiB)": 69.34, "reward": 0.8304434299468995, "reward_std": 0.3964572370052338, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.23044342994689943, "rewards/ReportKG_Jaccard/std": 0.07357453852891922, "step": 2515, "train_speed(iter/s)": 0.036817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 74.725, "completions/min_length": 58.4, "epoch": 0.509090909090909, "grad_norm": 1.4189027547836304, "kl": 0.040640837699174884, "learning_rate": 2.698928359198197e-06, "loss": 0.023150426149368287, "memory(GiB)": 69.34, "reward": 0.9596181571483612, "reward_std": 0.2963738486170769, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.20961817502975463, "rewards/ReportKG_Jaccard/std": 0.04835303239524365, "step": 2520, "train_speed(iter/s)": 0.036838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 68.8, "completions/min_length": 55.2, "epoch": 0.51010101010101, "grad_norm": 1.648171067237854, "kl": 0.03793911263346672, "learning_rate": 2.6914897629958214e-06, "loss": 0.007040271162986755, "memory(GiB)": 69.34, "reward": 0.7057561159133912, "reward_std": 0.33067440912127494, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.1807560846209526, "rewards/ReportKG_Jaccard/std": 0.04814482033252716, "step": 2525, "train_speed(iter/s)": 0.036843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 72.6, "completions/min_length": 54.4, "epoch": 0.5111111111111111, "grad_norm": 0.9919636249542236, "kl": 0.030599917098879815, "learning_rate": 2.6840402866513377e-06, "loss": 0.007536003738641739, "memory(GiB)": 69.34, "reward": 0.7023383021354676, "reward_std": 0.5353638648986816, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.49153932929039, "rewards/ReportKG_Jaccard/mean": 0.27733829617500305, "rewards/ReportKG_Jaccard/std": 0.08799230307340622, "step": 2530, "train_speed(iter/s)": 0.036844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 74.6, "completions/min_length": 58.2, "epoch": 0.5121212121212121, "grad_norm": 1.3413037061691284, "kl": 0.03990805149078369, "learning_rate": 2.6765800473774147e-06, "loss": 0.09805830717086791, "memory(GiB)": 69.34, "reward": 1.1266251385211945, "reward_std": 0.19855418279767037, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.3266251504421234, "rewards/ReportKG_Jaccard/std": 0.08595681414008141, "step": 2535, "train_speed(iter/s)": 0.036838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.2, "completions/mean_length": 71.275, "completions/min_length": 58.0, "epoch": 0.5131313131313131, "grad_norm": 1.3768506050109863, "kl": 0.039457186684012414, "learning_rate": 2.6691091625560703e-06, "loss": 0.017019782960414887, "memory(GiB)": 69.34, "reward": 0.8843408286571502, "reward_std": 0.33897585421800613, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.2093408316373825, "rewards/ReportKG_Jaccard/std": 0.08293592445552349, "step": 2540, "train_speed(iter/s)": 0.036857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 117.8, "completions/mean_length": 81.85, "completions/min_length": 57.6, "epoch": 0.5141414141414141, "grad_norm": 0.9863182306289673, "kl": 0.03248449359089136, "learning_rate": 2.6616277497368236e-06, "loss": -0.009425513446331024, "memory(GiB)": 69.34, "reward": 0.5531758934259414, "reward_std": 0.464565372467041, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.4410387217998505, "rewards/ReportKG_Jaccard/mean": 0.12817590385675431, "rewards/ReportKG_Jaccard/std": 0.04643810093402863, "step": 2545, "train_speed(iter/s)": 0.036839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 121.6, "completions/mean_length": 86.75, "completions/min_length": 61.6, "epoch": 0.5151515151515151, "grad_norm": 1.3123693466186523, "kl": 0.04681122712790966, "learning_rate": 2.6541359266348434e-06, "loss": -0.01082761138677597, "memory(GiB)": 69.34, "reward": 0.9107796907424927, "reward_std": 0.42583459466695783, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3880294978618622, "rewards/ReportKG_Jaccard/mean": 0.21077969968318938, "rewards/ReportKG_Jaccard/std": 0.06021316535770893, "step": 2550, "train_speed(iter/s)": 0.036829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 73.35, "completions/min_length": 58.4, "epoch": 0.5161616161616162, "grad_norm": 1.2871294021606445, "kl": 0.041391600668430326, "learning_rate": 2.6466338111290977e-06, "loss": 0.013293226063251496, "memory(GiB)": 69.34, "reward": 1.1174713492393493, "reward_std": 0.24967502653598786, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.2674713894724846, "rewards/ReportKG_Jaccard/std": 0.08608022183179856, "step": 2555, "train_speed(iter/s)": 0.036846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 74.375, "completions/min_length": 53.4, "epoch": 0.5171717171717172, "grad_norm": 1.3888776302337646, "kl": 0.04557814449071884, "learning_rate": 2.6391215212605e-06, "loss": -0.003420111536979675, "memory(GiB)": 69.34, "reward": 0.7176326036453247, "reward_std": 0.3431906342506409, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.1426326259970665, "rewards/ReportKG_Jaccard/std": 0.059762436896562576, "step": 2560, "train_speed(iter/s)": 0.036852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 69.625, "completions/min_length": 54.4, "epoch": 0.5181818181818182, "grad_norm": 1.6266721487045288, "kl": 0.040244140475988385, "learning_rate": 2.63159917523005e-06, "loss": 0.027217817306518555, "memory(GiB)": 69.34, "reward": 0.72543506026268, "reward_std": 0.31815949380397796, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.17543503344058992, "rewards/ReportKG_Jaccard/std": 0.06032842099666595, "step": 2565, "train_speed(iter/s)": 0.036869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.2, "completions/mean_length": 65.05, "completions/min_length": 51.0, "epoch": 0.5191919191919192, "grad_norm": 1.512749433517456, "kl": 0.06046332381665707, "learning_rate": 2.6240668913969737e-06, "loss": 0.0306683212518692, "memory(GiB)": 69.34, "reward": 1.0355478882789613, "reward_std": 0.36745654940605166, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.28554789125919344, "rewards/ReportKG_Jaccard/std": 0.09981019049882889, "step": 2570, "train_speed(iter/s)": 0.036875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.4, "completions/mean_length": 81.025, "completions/min_length": 63.0, "epoch": 0.5202020202020202, "grad_norm": 1.246014952659607, "kl": 0.040041156485676764, "learning_rate": 2.616524788276865e-06, "loss": 0.03855221271514893, "memory(GiB)": 69.34, "reward": 0.7132241368293762, "reward_std": 0.39540269374847414, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.16322412639856337, "rewards/ReportKG_Jaccard/std": 0.07560020834207534, "step": 2575, "train_speed(iter/s)": 0.036876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.8, "completions/mean_length": 64.025, "completions/min_length": 49.0, "epoch": 0.5212121212121212, "grad_norm": 0.8486554026603699, "kl": 0.055747388675808904, "learning_rate": 2.6089729845398143e-06, "loss": 0.06317243576049805, "memory(GiB)": 69.34, "reward": 1.122553563117981, "reward_std": 0.3463422417640686, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2725535839796066, "rewards/ReportKG_Jaccard/std": 0.061421144753694534, "step": 2580, "train_speed(iter/s)": 0.036898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 74.575, "completions/min_length": 54.2, "epoch": 0.5222222222222223, "grad_norm": 1.4402250051498413, "kl": 0.026017686910927297, "learning_rate": 2.6014115990085457e-06, "loss": 0.010870489478111266, "memory(GiB)": 69.34, "reward": 0.5492633670568466, "reward_std": 0.24171391427516936, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.1742633730173111, "rewards/ReportKG_Jaccard/std": 0.05902246423065662, "step": 2585, "train_speed(iter/s)": 0.03689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.6, "completions/mean_length": 72.0, "completions/min_length": 55.8, "epoch": 0.5232323232323233, "grad_norm": 1.5800580978393555, "kl": 0.03127300441265106, "learning_rate": 2.59384075065655e-06, "loss": -0.015001021325588226, "memory(GiB)": 69.34, "reward": 0.6328940033912659, "reward_std": 0.3086811020970345, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.13289400786161423, "rewards/ReportKG_Jaccard/std": 0.0714496210217476, "step": 2590, "train_speed(iter/s)": 0.03691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 69.775, "completions/min_length": 55.2, "epoch": 0.5242424242424243, "grad_norm": 1.3110188245773315, "kl": 0.027323388308286668, "learning_rate": 2.586260558606204e-06, "loss": 0.0043322786688804625, "memory(GiB)": 69.34, "reward": 0.5171697288751602, "reward_std": 0.24730593115091323, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.24216972291469574, "rewards/ReportKG_Jaccard/std": 0.044452738389372824, "step": 2595, "train_speed(iter/s)": 0.036918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 74.0, "completions/min_length": 52.0, "epoch": 0.5252525252525253, "grad_norm": 1.4461958408355713, "kl": 0.033171935379505156, "learning_rate": 2.5786711421269056e-06, "loss": 0.07007441520690919, "memory(GiB)": 69.34, "reward": 0.5890900313854217, "reward_std": 0.3105606213212013, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2140900433063507, "rewards/ReportKG_Jaccard/std": 0.07595456242561341, "step": 2600, "train_speed(iter/s)": 0.036881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 72.75, "completions/min_length": 56.6, "epoch": 0.5262626262626262, "grad_norm": 1.0943052768707275, "kl": 0.02460206039249897, "learning_rate": 2.571072620633191e-06, "loss": -0.0008659698069095612, "memory(GiB)": 69.34, "reward": 1.1358157873153687, "reward_std": 0.2947754397988319, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.26081580221652984, "rewards/ReportKG_Jaccard/std": 0.07697845175862313, "step": 2605, "train_speed(iter/s)": 0.036899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 80.55, "completions/min_length": 62.4, "epoch": 0.5272727272727272, "grad_norm": 1.5324515104293823, "kl": 0.028449346870183946, "learning_rate": 2.5634651136828594e-06, "loss": 0.05765408277511597, "memory(GiB)": 69.34, "reward": 0.7960741221904755, "reward_std": 0.312664607167244, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.27107412815093995, "rewards/ReportKG_Jaccard/std": 0.07586443573236465, "step": 2610, "train_speed(iter/s)": 0.036894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.8, "completions/mean_length": 79.825, "completions/min_length": 59.8, "epoch": 0.5282828282828282, "grad_norm": 1.5316063165664673, "kl": 0.04392264746129513, "learning_rate": 2.555848740975089e-06, "loss": -0.00570012629032135, "memory(GiB)": 69.34, "reward": 0.7162598729133606, "reward_std": 0.4981889188289642, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.24125988483428956, "rewards/ReportKG_Jaccard/std": 0.07056925445795059, "step": 2615, "train_speed(iter/s)": 0.036881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 122.8, "completions/mean_length": 84.575, "completions/min_length": 57.0, "epoch": 0.5292929292929293, "grad_norm": 1.2065097093582153, "kl": 0.0331488911062479, "learning_rate": 2.5482236223485553e-06, "loss": 0.028087371587753297, "memory(GiB)": 69.34, "reward": 0.7698001086711883, "reward_std": 0.20109921544790268, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.14480012580752372, "rewards/ReportKG_Jaccard/std": 0.0537610225379467, "step": 2620, "train_speed(iter/s)": 0.03687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.0, "completions/mean_length": 65.4, "completions/min_length": 52.6, "epoch": 0.5303030303030303, "grad_norm": 1.08055579662323, "kl": 0.0415990274399519, "learning_rate": 2.5405898777795455e-06, "loss": 0.014516279101371765, "memory(GiB)": 69.34, "reward": 1.088149619102478, "reward_std": 0.24509579241275786, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.23814960420131684, "rewards/ReportKG_Jaccard/std": 0.07629338577389717, "step": 2625, "train_speed(iter/s)": 0.036871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.4, "completions/mean_length": 71.375, "completions/min_length": 54.8, "epoch": 0.5313131313131313, "grad_norm": 1.4320452213287354, "kl": 0.049700308591127396, "learning_rate": 2.5329476273800696e-06, "loss": 0.015437854826450348, "memory(GiB)": 69.34, "reward": 0.7353747010231018, "reward_std": 0.3420104175806046, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.21037468314170837, "rewards/ReportKG_Jaccard/std": 0.06504993438720703, "step": 2630, "train_speed(iter/s)": 0.036883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 69.1, "completions/min_length": 48.6, "epoch": 0.5323232323232323, "grad_norm": 3.1763808727264404, "kl": 0.05255702212452888, "learning_rate": 2.525296991395973e-06, "loss": 0.019987143576145172, "memory(GiB)": 69.34, "reward": 0.9212943553924561, "reward_std": 0.280847904086113, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.24629438519477845, "rewards/ReportKG_Jaccard/std": 0.08681619688868522, "step": 2635, "train_speed(iter/s)": 0.036891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 72.0, "completions/mean_length": 59.975, "completions/min_length": 49.2, "epoch": 0.5333333333333333, "grad_norm": 1.2699023485183716, "kl": 0.030198436975479127, "learning_rate": 2.5176380902050414e-06, "loss": 0.012224763631820679, "memory(GiB)": 69.34, "reward": 0.8707063138484955, "reward_std": 0.4012010768055916, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2207062989473343, "rewards/ReportKG_Jaccard/std": 0.04532121978700161, "step": 2640, "train_speed(iter/s)": 0.036912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.2, "completions/mean_length": 77.525, "completions/min_length": 55.8, "epoch": 0.5343434343434343, "grad_norm": 1.1918022632598877, "kl": 0.04689030796289444, "learning_rate": 2.5099710443151082e-06, "loss": 0.06835309267044068, "memory(GiB)": 69.34, "reward": 0.969048672914505, "reward_std": 0.2702245458960533, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.20701966285705567, "rewards/ReportKG_Jaccard/mean": 0.2190486952662468, "rewards/ReportKG_Jaccard/std": 0.07662567794322968, "step": 2645, "train_speed(iter/s)": 0.036903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 70.0, "completions/min_length": 46.2, "epoch": 0.5353535353535354, "grad_norm": 1.4386959075927734, "kl": 0.029477788507938384, "learning_rate": 2.502295974362158e-06, "loss": 0.05430396795272827, "memory(GiB)": 69.34, "reward": 0.8410016119480133, "reward_std": 0.3864053964614868, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.1660016067326069, "rewards/ReportKG_Jaccard/std": 0.0888174258172512, "step": 2650, "train_speed(iter/s)": 0.036875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 77.125, "completions/min_length": 56.6, "epoch": 0.5363636363636364, "grad_norm": 1.4154413938522339, "kl": 0.040138349682092664, "learning_rate": 2.4946130011084306e-06, "loss": 0.05165087580680847, "memory(GiB)": 69.34, "reward": 0.8933942556381226, "reward_std": 0.19754408448934554, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.2433942437171936, "rewards/ReportKG_Jaccard/std": 0.10647514648735523, "step": 2655, "train_speed(iter/s)": 0.036869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 72.2, "completions/min_length": 47.2, "epoch": 0.5373737373737374, "grad_norm": 1.4607535600662231, "kl": 0.02761542797088623, "learning_rate": 2.4869222454405177e-06, "loss": -0.03836554288864136, "memory(GiB)": 69.34, "reward": 0.7038582384586334, "reward_std": 0.2878084257245064, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2538582384586334, "rewards/ReportKG_Jaccard/std": 0.07578910738229752, "step": 2660, "train_speed(iter/s)": 0.036872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.6, "completions/mean_length": 71.75, "completions/min_length": 52.6, "epoch": 0.5383838383838384, "grad_norm": 1.4600858688354492, "kl": 0.03588134720921517, "learning_rate": 2.4792238283674623e-06, "loss": -0.032015687227249144, "memory(GiB)": 69.34, "reward": 0.8313702702522278, "reward_std": 0.4130531013011932, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.2563702329993248, "rewards/ReportKG_Jaccard/std": 0.07707587853074074, "step": 2665, "train_speed(iter/s)": 0.036858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.4, "completions/mean_length": 83.575, "completions/min_length": 57.2, "epoch": 0.5393939393939394, "grad_norm": 1.1702271699905396, "kl": 0.026710308343172073, "learning_rate": 2.4715178710188546e-06, "loss": 0.02316918075084686, "memory(GiB)": 69.34, "reward": 0.848870038986206, "reward_std": 0.49268399477005004, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4553455114364624, "rewards/ReportKG_Jaccard/mean": 0.19887004643678666, "rewards/ReportKG_Jaccard/std": 0.0842143751680851, "step": 2670, "train_speed(iter/s)": 0.036864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/mean_length": 84.0, "completions/min_length": 62.8, "epoch": 0.5404040404040404, "grad_norm": 1.156792402267456, "kl": 0.033305196836590764, "learning_rate": 2.463804494642926e-06, "loss": 0.0012841105461120605, "memory(GiB)": 69.34, "reward": 0.9952332973480225, "reward_std": 0.3721383407711983, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.19523329734802247, "rewards/ReportKG_Jaccard/std": 0.0655131071805954, "step": 2675, "train_speed(iter/s)": 0.036867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.2, "completions/mean_length": 88.075, "completions/min_length": 63.4, "epoch": 0.5414141414141415, "grad_norm": 1.3270859718322754, "kl": 0.03574162386357784, "learning_rate": 2.4560838206046433e-06, "loss": 0.0499484121799469, "memory(GiB)": 69.34, "reward": 0.9564732104539871, "reward_std": 0.21176307275891304, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2314732164144516, "rewards/ReportKG_Jaccard/std": 0.05719218850135803, "step": 2680, "train_speed(iter/s)": 0.036879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 73.375, "completions/min_length": 55.6, "epoch": 0.5424242424242425, "grad_norm": 1.0593695640563965, "kl": 0.033152681589126584, "learning_rate": 2.448355970383794e-06, "loss": 0.025997853279113768, "memory(GiB)": 69.34, "reward": 1.1463645100593567, "reward_std": 0.3195883668959141, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2713645279407501, "rewards/ReportKG_Jaccard/std": 0.08744821809232235, "step": 2685, "train_speed(iter/s)": 0.036895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 75.175, "completions/min_length": 54.4, "epoch": 0.5434343434343434, "grad_norm": 1.5731470584869385, "kl": 0.033923492580652234, "learning_rate": 2.4406210655730814e-06, "loss": 0.006701156497001648, "memory(GiB)": 69.34, "reward": 0.9441970944404602, "reward_std": 0.37298251911997793, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.16919709444046022, "rewards/ReportKG_Jaccard/std": 0.04490819871425629, "step": 2690, "train_speed(iter/s)": 0.036905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.8, "completions/mean_length": 80.325, "completions/min_length": 58.0, "epoch": 0.5444444444444444, "grad_norm": 1.5943233966827393, "kl": 0.04713100716471672, "learning_rate": 2.4328792278762058e-06, "loss": 0.050475692749023436, "memory(GiB)": 69.34, "reward": 0.9734611630439758, "reward_std": 0.35696980357170105, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.24846116900444032, "rewards/ReportKG_Jaccard/std": 0.08938586935400963, "step": 2695, "train_speed(iter/s)": 0.036896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.2, "completions/mean_length": 64.1, "completions/min_length": 50.0, "epoch": 0.5454545454545454, "grad_norm": 1.7285666465759277, "kl": 0.03631706461310387, "learning_rate": 2.425130579105953e-06, "loss": 0.07781736850738526, "memory(GiB)": 69.34, "reward": 1.1606794238090514, "reward_std": 0.16642570346593857, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.23567941784858704, "rewards/ReportKG_Jaccard/std": 0.06695752441883088, "step": 2700, "train_speed(iter/s)": 0.036892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.4, "completions/mean_length": 61.725, "completions/min_length": 47.8, "epoch": 0.5464646464646464, "grad_norm": 1.274316430091858, "kl": 0.051355960220098494, "learning_rate": 2.4173752411822775e-06, "loss": 0.050342869758605954, "memory(GiB)": 69.34, "reward": 1.0458079338073731, "reward_std": 0.292253053933382, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.19580796360969543, "rewards/ReportKG_Jaccard/std": 0.05941811501979828, "step": 2705, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 81.95, "completions/min_length": 64.2, "epoch": 0.5474747474747474, "grad_norm": 1.442641258239746, "kl": 0.032146578282117845, "learning_rate": 2.4096133361303815e-06, "loss": 0.0807762622833252, "memory(GiB)": 69.34, "reward": 0.7355417609214783, "reward_std": 0.1767820194363594, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.2605417728424072, "rewards/ReportKG_Jaccard/std": 0.08313431665301323, "step": 2710, "train_speed(iter/s)": 0.036895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 70.95, "completions/min_length": 47.8, "epoch": 0.5484848484848485, "grad_norm": 1.6996943950653076, "kl": 0.040942300111055374, "learning_rate": 2.4018449860787973e-06, "loss": 0.07716124057769776, "memory(GiB)": 69.34, "reward": 1.0272239685058593, "reward_std": 0.30287788063287735, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2022239573299885, "rewards/ReportKG_Jaccard/std": 0.0792689174413681, "step": 2715, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 73.775, "completions/min_length": 56.6, "epoch": 0.5494949494949495, "grad_norm": 1.2947442531585693, "kl": 0.03185653574764728, "learning_rate": 2.394070313257466e-06, "loss": 0.010906273126602173, "memory(GiB)": 69.34, "reward": 0.9742974460124969, "reward_std": 0.3716707944869995, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2742974519729614, "rewards/ReportKG_Jaccard/std": 0.0913136899471283, "step": 2720, "train_speed(iter/s)": 0.03691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 73.8, "completions/min_length": 55.6, "epoch": 0.5505050505050505, "grad_norm": 1.2496947050094604, "kl": 0.025685600563883783, "learning_rate": 2.386289439995811e-06, "loss": 0.03876148760318756, "memory(GiB)": 69.34, "reward": 0.624637246131897, "reward_std": 0.3182599365711212, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2996372550725937, "rewards/ReportKG_Jaccard/std": 0.07263723090291023, "step": 2725, "train_speed(iter/s)": 0.036925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.4, "completions/mean_length": 84.525, "completions/min_length": 58.8, "epoch": 0.5515151515151515, "grad_norm": 1.2810887098312378, "kl": 0.05729696564376354, "learning_rate": 2.3785024887208203e-06, "loss": 0.007703512907028198, "memory(GiB)": 69.34, "reward": 0.8072011709213257, "reward_std": 0.3987865224480629, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.23220115303993225, "rewards/ReportKG_Jaccard/std": 0.07764562256634236, "step": 2730, "train_speed(iter/s)": 0.036932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 77.75, "completions/min_length": 50.8, "epoch": 0.5525252525252525, "grad_norm": 1.4403172731399536, "kl": 0.0488728016614914, "learning_rate": 2.3707095819551105e-06, "loss": -0.030273249745368956, "memory(GiB)": 69.34, "reward": 0.9136798679828644, "reward_std": 0.35776644200086594, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.1886798892170191, "rewards/ReportKG_Jaccard/std": 0.05973528064787388, "step": 2735, "train_speed(iter/s)": 0.036939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 81.075, "completions/min_length": 65.4, "epoch": 0.5535353535353535, "grad_norm": 0.6364297270774841, "kl": 0.06816506981849671, "learning_rate": 2.362910842315005e-06, "loss": 0.04266068041324615, "memory(GiB)": 69.34, "reward": 1.1881509304046631, "reward_std": 0.2514637246727943, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2631509602069855, "rewards/ReportKG_Jaccard/std": 0.07997515574097633, "step": 2740, "train_speed(iter/s)": 0.036926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 72.975, "completions/min_length": 60.6, "epoch": 0.5545454545454546, "grad_norm": 1.1886365413665771, "kl": 0.025844671577215195, "learning_rate": 2.355106392508607e-06, "loss": 0.050321513414382936, "memory(GiB)": 69.34, "reward": 0.9559199094772339, "reward_std": 0.4149927109479904, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2559199184179306, "rewards/ReportKG_Jaccard/std": 0.0659762904047966, "step": 2745, "train_speed(iter/s)": 0.036911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 79.6, "completions/min_length": 51.6, "epoch": 0.5555555555555556, "grad_norm": 1.2149461507797241, "kl": 0.045825076103210446, "learning_rate": 2.347296355333861e-06, "loss": -0.040417653322219846, "memory(GiB)": 69.34, "reward": 0.6486437439918518, "reward_std": 0.4732637107372284, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.14864376038312913, "rewards/ReportKG_Jaccard/std": 0.061279650777578354, "step": 2750, "train_speed(iter/s)": 0.036906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 120.4, "completions/mean_length": 91.2, "completions/min_length": 60.2, "epoch": 0.5565656565656566, "grad_norm": 1.1876095533370972, "kl": 0.04658936783671379, "learning_rate": 2.339480853676628e-06, "loss": -0.013325807452201844, "memory(GiB)": 69.34, "reward": 1.0757296204566955, "reward_std": 0.2166646383702755, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.15072962641716003, "rewards/ReportKG_Jaccard/std": 0.03931460790336132, "step": 2755, "train_speed(iter/s)": 0.03689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 68.65, "completions/min_length": 53.0, "epoch": 0.5575757575757576, "grad_norm": 0.7738075256347656, "kl": 0.045190879702568056, "learning_rate": 2.3316600105087484e-06, "loss": 0.06316973567008972, "memory(GiB)": 69.34, "reward": 1.104158627986908, "reward_std": 0.33009479343891146, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.25415863990783694, "rewards/ReportKG_Jaccard/std": 0.09545463025569915, "step": 2760, "train_speed(iter/s)": 0.0369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 67.775, "completions/min_length": 50.2, "epoch": 0.5585858585858586, "grad_norm": 1.275774598121643, "kl": 0.05169812850654125, "learning_rate": 2.3238339488861074e-06, "loss": 0.02937674820423126, "memory(GiB)": 69.34, "reward": 0.8795137286186219, "reward_std": 0.24695156663656234, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.25451373159885404, "rewards/ReportKG_Jaccard/std": 0.08243275806307793, "step": 2765, "train_speed(iter/s)": 0.036892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.8, "completions/mean_length": 79.85, "completions/min_length": 58.0, "epoch": 0.5595959595959596, "grad_norm": 1.0686914920806885, "kl": 0.03175222538411617, "learning_rate": 2.3160027919467e-06, "loss": 0.010041096806526184, "memory(GiB)": 69.34, "reward": 0.9247438907623291, "reward_std": 0.23602011650800706, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.22474391162395477, "rewards/ReportKG_Jaccard/std": 0.07505133226513863, "step": 2770, "train_speed(iter/s)": 0.036884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 70.45, "completions/min_length": 55.8, "epoch": 0.5606060606060606, "grad_norm": 1.6516233682632446, "kl": 0.04796867556869984, "learning_rate": 2.308166662908691e-06, "loss": -0.02471364140510559, "memory(GiB)": 69.34, "reward": 0.8020736902952195, "reward_std": 0.3420582994818687, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.15207368582487107, "rewards/ReportKG_Jaccard/std": 0.04167996756732464, "step": 2775, "train_speed(iter/s)": 0.036886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 72.55, "completions/min_length": 60.4, "epoch": 0.5616161616161616, "grad_norm": 1.0740011930465698, "kl": 0.049113478511571884, "learning_rate": 2.3003256850684806e-06, "loss": 0.06761345267295837, "memory(GiB)": 69.34, "reward": 1.174748420715332, "reward_std": 0.31183076798915865, "rewards/MultiModalAccuracyORM_Any/mean": 0.9, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.2747483760118484, "rewards/ReportKG_Jaccard/std": 0.05740656889975071, "step": 2780, "train_speed(iter/s)": 0.036894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 87.35, "completions/min_length": 70.6, "epoch": 0.5626262626262626, "grad_norm": 1.3540847301483154, "kl": 0.03523118048906326, "learning_rate": 2.292479981798759e-06, "loss": 0.0468008279800415, "memory(GiB)": 69.34, "reward": 0.8872498542070388, "reward_std": 0.28602243065834043, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.21224985718727113, "rewards/ReportKG_Jaccard/std": 0.06415785402059555, "step": 2785, "train_speed(iter/s)": 0.036902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.4, "completions/mean_length": 70.25, "completions/min_length": 58.6, "epoch": 0.5636363636363636, "grad_norm": 1.1786646842956543, "kl": 0.03524694200605154, "learning_rate": 2.2846296765465706e-06, "loss": 0.03548694849014282, "memory(GiB)": 69.34, "reward": 1.064810585975647, "reward_std": 0.27657418698072433, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.31481059789657595, "rewards/ReportKG_Jaccard/std": 0.08960478156805038, "step": 2790, "train_speed(iter/s)": 0.036914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 70.275, "completions/min_length": 53.0, "epoch": 0.5646464646464646, "grad_norm": 1.3994886875152588, "kl": 0.02646980732679367, "learning_rate": 2.2767748928313657e-06, "loss": -0.02856876254081726, "memory(GiB)": 69.34, "reward": 0.8660288274288177, "reward_std": 0.30486361458897593, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.21602880954742432, "rewards/ReportKG_Jaccard/std": 0.07110866233706474, "step": 2795, "train_speed(iter/s)": 0.036907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 73.625, "completions/min_length": 55.0, "epoch": 0.5656565656565656, "grad_norm": 1.0899277925491333, "kl": 0.04633402898907661, "learning_rate": 2.268915754243064e-06, "loss": 0.028436261415481567, "memory(GiB)": 69.34, "reward": 0.5969919323921203, "reward_std": 0.45202340483665465, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.17199191385880114, "rewards/ReportKG_Jaccard/std": 0.047269370034337045, "step": 2800, "train_speed(iter/s)": 0.036889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.8, "completions/mean_length": 73.725, "completions/min_length": 53.2, "epoch": 0.5666666666666667, "grad_norm": 0.9096480011940002, "kl": 0.05501449778676033, "learning_rate": 2.2610523844401034e-06, "loss": 0.03671562075614929, "memory(GiB)": 69.34, "reward": 1.0594428420066833, "reward_std": 0.4282618463039398, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.23444286584854127, "rewards/ReportKG_Jaccard/std": 0.09004001244902611, "step": 2805, "train_speed(iter/s)": 0.036874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 124.0, "completions/mean_length": 88.65, "completions/min_length": 65.6, "epoch": 0.5676767676767677, "grad_norm": 1.3154120445251465, "kl": 0.028268988803029062, "learning_rate": 2.2531849071474984e-06, "loss": 0.05652427077293396, "memory(GiB)": 69.34, "reward": 0.6435429990291596, "reward_std": 0.3571448788046837, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.19354299008846282, "rewards/ReportKG_Jaccard/std": 0.047568701952695844, "step": 2810, "train_speed(iter/s)": 0.036845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/mean_length": 63.85, "completions/min_length": 52.8, "epoch": 0.5686868686868687, "grad_norm": 1.441846489906311, "kl": 0.050879810377955435, "learning_rate": 2.2453134461548933e-06, "loss": 0.018610666692256927, "memory(GiB)": 69.34, "reward": 1.1282727241516113, "reward_std": 0.3264410957694054, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.3032726854085922, "rewards/ReportKG_Jaccard/std": 0.10491614490747452, "step": 2815, "train_speed(iter/s)": 0.036856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 76.15, "completions/min_length": 61.8, "epoch": 0.5696969696969697, "grad_norm": 1.3967766761779785, "kl": 0.04468559063971043, "learning_rate": 2.23743812531461e-06, "loss": 0.05342965126037598, "memory(GiB)": 69.34, "reward": 0.868577492237091, "reward_std": 0.41546338200569155, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.21857749819755554, "rewards/ReportKG_Jaccard/std": 0.08770475015044213, "step": 2820, "train_speed(iter/s)": 0.036867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 116.6, "completions/mean_length": 83.6, "completions/min_length": 62.0, "epoch": 0.5707070707070707, "grad_norm": 1.1562460660934448, "kl": 0.03516559526324272, "learning_rate": 2.2295590685397073e-06, "loss": 0.0036403283476829527, "memory(GiB)": 69.34, "reward": 0.7651915729045868, "reward_std": 0.32237585708498956, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.19019159376621247, "rewards/ReportKG_Jaccard/std": 0.05451641231775284, "step": 2825, "train_speed(iter/s)": 0.036865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 74.6, "completions/min_length": 59.0, "epoch": 0.5717171717171717, "grad_norm": 1.185490369796753, "kl": 0.0319850105792284, "learning_rate": 2.221676399802022e-06, "loss": 0.02803099751472473, "memory(GiB)": 69.34, "reward": 0.8259147703647614, "reward_std": 0.3217435419559479, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.22591478303074836, "rewards/ReportKG_Jaccard/std": 0.06831515431404114, "step": 2830, "train_speed(iter/s)": 0.03687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 71.1, "completions/min_length": 51.2, "epoch": 0.5727272727272728, "grad_norm": 1.227172613143921, "kl": 0.024999356642365456, "learning_rate": 2.213790243130226e-06, "loss": 0.08884393572807311, "memory(GiB)": 69.34, "reward": 0.7810479283332825, "reward_std": 0.209079253859818, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.15604790449142455, "rewards/ReportKG_Jaccard/std": 0.054822547174990174, "step": 2835, "train_speed(iter/s)": 0.03685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 77.95, "completions/min_length": 55.0, "epoch": 0.5737373737373738, "grad_norm": 1.2721341848373413, "kl": 0.04555640369653702, "learning_rate": 2.205900722607869e-06, "loss": 0.008568526804447174, "memory(GiB)": 69.34, "reward": 0.9098431229591369, "reward_std": 0.2720129892230034, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.20984311550855636, "rewards/ReportKG_Jaccard/std": 0.05854564495384693, "step": 2840, "train_speed(iter/s)": 0.036859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 73.625, "completions/min_length": 48.6, "epoch": 0.5747474747474748, "grad_norm": 1.754467487335205, "kl": 0.032803502306342124, "learning_rate": 2.198007962371431e-06, "loss": 0.016333034634590148, "memory(GiB)": 69.34, "reward": 0.8473554790019989, "reward_std": 0.4136831611394882, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.22235548943281175, "rewards/ReportKG_Jaccard/std": 0.11401189863681793, "step": 2845, "train_speed(iter/s)": 0.036874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.2, "completions/mean_length": 60.55, "completions/min_length": 49.4, "epoch": 0.5757575757575758, "grad_norm": 1.5421055555343628, "kl": 0.04027213640511036, "learning_rate": 2.190112086608365e-06, "loss": 0.08448542356491089, "memory(GiB)": 69.34, "reward": 1.0653851985931397, "reward_std": 0.28796553760766985, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.26538515388965606, "rewards/ReportKG_Jaccard/std": 0.07566306814551353, "step": 2850, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 71.9, "completions/min_length": 51.0, "epoch": 0.5767676767676768, "grad_norm": 1.4515979290008545, "kl": 0.04197903387248516, "learning_rate": 2.182213219555147e-06, "loss": 0.037319999933242795, "memory(GiB)": 69.34, "reward": 0.7072400063276291, "reward_std": 0.3385833911597729, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.15723999142646788, "rewards/ReportKG_Jaccard/std": 0.05598461627960205, "step": 2855, "train_speed(iter/s)": 0.036877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 72.375, "completions/min_length": 56.0, "epoch": 0.5777777777777777, "grad_norm": 1.498836636543274, "kl": 0.03440258987247944, "learning_rate": 2.1743114854953166e-06, "loss": -0.0004281654953956604, "memory(GiB)": 69.34, "reward": 0.9522374033927917, "reward_std": 0.4801657021045685, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4478123545646667, "rewards/ReportKG_Jaccard/mean": 0.2522373914718628, "rewards/ReportKG_Jaccard/std": 0.09426186978816986, "step": 2860, "train_speed(iter/s)": 0.036882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 70.975, "completions/min_length": 55.4, "epoch": 0.5787878787878787, "grad_norm": 1.2311943769454956, "kl": 0.0323231253772974, "learning_rate": 2.166407008757525e-06, "loss": -0.04315813779830933, "memory(GiB)": 69.34, "reward": 1.0109437823295593, "reward_std": 0.14870880991220475, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.23594380617141725, "rewards/ReportKG_Jaccard/std": 0.0920739471912384, "step": 2865, "train_speed(iter/s)": 0.036899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 116.4, "completions/mean_length": 88.075, "completions/min_length": 70.4, "epoch": 0.5797979797979798, "grad_norm": 1.3365153074264526, "kl": 0.029596728459000587, "learning_rate": 2.1584999137135767e-06, "loss": 0.040153992176055905, "memory(GiB)": 69.34, "reward": 1.2210144996643066, "reward_std": 0.22274519354104996, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2960145056247711, "rewards/ReportKG_Jaccard/std": 0.07126249596476555, "step": 2870, "train_speed(iter/s)": 0.036889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 79.9, "completions/min_length": 58.6, "epoch": 0.5808080808080808, "grad_norm": 1.1722609996795654, "kl": 0.04012848734855652, "learning_rate": 2.1505903247764762e-06, "loss": -0.01411658525466919, "memory(GiB)": 69.34, "reward": 0.8153386980295181, "reward_std": 0.3472931519150734, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.265338708460331, "rewards/ReportKG_Jaccard/std": 0.06208024099469185, "step": 2875, "train_speed(iter/s)": 0.036905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 66.075, "completions/min_length": 49.6, "epoch": 0.5818181818181818, "grad_norm": 1.991445779800415, "kl": 0.03163397386670112, "learning_rate": 2.1426783663984645e-06, "loss": -0.04500848054885864, "memory(GiB)": 69.34, "reward": 0.9729408204555512, "reward_std": 0.27358400672674177, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2979408085346222, "rewards/ReportKG_Jaccard/std": 0.061739854514598846, "step": 2880, "train_speed(iter/s)": 0.036912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 79.5, "completions/min_length": 59.2, "epoch": 0.5828282828282828, "grad_norm": 1.45590078830719, "kl": 0.03188483975827694, "learning_rate": 2.1347641630690666e-06, "loss": -0.01616826355457306, "memory(GiB)": 69.34, "reward": 0.7965272247791291, "reward_std": 0.45023933202028277, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4099008321762085, "rewards/ReportKG_Jaccard/mean": 0.22152723371982574, "rewards/ReportKG_Jaccard/std": 0.0608558289706707, "step": 2885, "train_speed(iter/s)": 0.036898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.6, "completions/mean_length": 76.35, "completions/min_length": 56.8, "epoch": 0.5838383838383838, "grad_norm": 0.9565982222557068, "kl": 0.034551508724689484, "learning_rate": 2.1268478393131287e-06, "loss": -0.029333308339118958, "memory(GiB)": 69.34, "reward": 0.6727509170770645, "reward_std": 0.42384873032569886, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.222750923037529, "rewards/ReportKG_Jaccard/std": 0.06588644459843636, "step": 2890, "train_speed(iter/s)": 0.036886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/mean_length": 73.575, "completions/min_length": 57.4, "epoch": 0.5848484848484848, "grad_norm": 1.3098655939102173, "kl": 0.055530013889074324, "learning_rate": 2.118929519688862e-06, "loss": 0.09501435160636902, "memory(GiB)": 69.34, "reward": 1.064129364490509, "reward_std": 0.3223728150129318, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.23912939727306365, "rewards/ReportKG_Jaccard/std": 0.07646634578704833, "step": 2895, "train_speed(iter/s)": 0.036902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.4, "completions/mean_length": 73.4, "completions/min_length": 53.2, "epoch": 0.5858585858585859, "grad_norm": 1.4750508069992065, "kl": 0.04432537369430065, "learning_rate": 2.1110093287858803e-06, "loss": -0.04741841852664948, "memory(GiB)": 69.34, "reward": 0.8283914804458619, "reward_std": 0.4219228565692902, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.17839145064353942, "rewards/ReportKG_Jaccard/std": 0.04442351162433624, "step": 2900, "train_speed(iter/s)": 0.036917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.0, "completions/mean_length": 79.375, "completions/min_length": 55.0, "epoch": 0.5868686868686869, "grad_norm": 1.343788504600525, "kl": 0.039088986441493036, "learning_rate": 2.103087391223242e-06, "loss": -0.06931307911872864, "memory(GiB)": 69.34, "reward": 0.7253482580184937, "reward_std": 0.4316541016101837, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.12534829080104828, "rewards/ReportKG_Jaccard/std": 0.043901610746979715, "step": 2905, "train_speed(iter/s)": 0.036925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 68.475, "completions/min_length": 52.6, "epoch": 0.5878787878787879, "grad_norm": 1.5178302526474, "kl": 0.041052503511309624, "learning_rate": 2.0951638316474847e-06, "loss": 0.04305768013000488, "memory(GiB)": 69.34, "reward": 0.8089891612529755, "reward_std": 0.2761663980782032, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.35898915529251096, "rewards/ReportKG_Jaccard/std": 0.09012315049767494, "step": 2910, "train_speed(iter/s)": 0.036942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 118.8, "completions/mean_length": 91.1, "completions/min_length": 70.6, "epoch": 0.5888888888888889, "grad_norm": 1.1380424499511719, "kl": 0.04468103237450123, "learning_rate": 2.087238774730672e-06, "loss": 0.059763312339782715, "memory(GiB)": 69.34, "reward": 1.00462464094162, "reward_std": 0.25389458537101744, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.22962463796138763, "rewards/ReportKG_Jaccard/std": 0.05664651095867157, "step": 2915, "train_speed(iter/s)": 0.036952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 126.4, "completions/mean_length": 86.9, "completions/min_length": 63.4, "epoch": 0.5898989898989899, "grad_norm": 1.5211361646652222, "kl": 0.03291969895362854, "learning_rate": 2.079312345168425e-06, "loss": 0.08172808289527893, "memory(GiB)": 69.34, "reward": 0.8700944721698761, "reward_std": 0.40378412008285525, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3863525390625, "rewards/ReportKG_Jaccard/mean": 0.2450944811105728, "rewards/ReportKG_Jaccard/std": 0.08898513168096542, "step": 2920, "train_speed(iter/s)": 0.036961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 70.3, "completions/min_length": 55.6, "epoch": 0.5909090909090909, "grad_norm": 1.3498698472976685, "kl": 0.05478955134749412, "learning_rate": 2.071384667677961e-06, "loss": 0.062479108572006226, "memory(GiB)": 69.34, "reward": 0.9969751596450805, "reward_std": 0.2690930888056755, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.27197516709566116, "rewards/ReportKG_Jaccard/std": 0.07165023237466812, "step": 2925, "train_speed(iter/s)": 0.03698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 66.025, "completions/min_length": 48.2, "epoch": 0.591919191919192, "grad_norm": 1.4550608396530151, "kl": 0.03804179728031158, "learning_rate": 2.0634558669961353e-06, "loss": -0.002162729203701019, "memory(GiB)": 69.34, "reward": 0.9174762845039368, "reward_std": 0.42299585938453677, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.31747627854347227, "rewards/ReportKG_Jaccard/std": 0.13069515265524387, "step": 2930, "train_speed(iter/s)": 0.036979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.8, "completions/mean_length": 66.575, "completions/min_length": 49.4, "epoch": 0.592929292929293, "grad_norm": 1.8378777503967285, "kl": 0.049236280098557475, "learning_rate": 2.0555260678774747e-06, "loss": 0.005790994316339493, "memory(GiB)": 69.34, "reward": 0.6602058410644531, "reward_std": 0.409299236536026, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.2102058619260788, "rewards/ReportKG_Jaccard/std": 0.06463722176849843, "step": 2935, "train_speed(iter/s)": 0.03699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.0, "completions/mean_length": 69.575, "completions/min_length": 52.4, "epoch": 0.593939393939394, "grad_norm": 1.3860148191452026, "kl": 0.04165230244398117, "learning_rate": 2.047595395092215e-06, "loss": 0.09250714182853699, "memory(GiB)": 69.34, "reward": 0.4480092376470566, "reward_std": 0.28944685608148574, "rewards/MultiModalAccuracyORM_Any/mean": 0.2, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.24800923131406308, "rewards/ReportKG_Jaccard/std": 0.09309358038008213, "step": 2940, "train_speed(iter/s)": 0.036987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 77.975, "completions/min_length": 65.0, "epoch": 0.5949494949494949, "grad_norm": 1.5473685264587402, "kl": 0.03607205376029014, "learning_rate": 2.03966397342434e-06, "loss": -0.0010296553373336792, "memory(GiB)": 69.34, "reward": 0.6384316146373749, "reward_std": 0.400027334690094, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.26343161463737486, "rewards/ReportKG_Jaccard/std": 0.06662994213402271, "step": 2945, "train_speed(iter/s)": 0.036988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 72.425, "completions/min_length": 53.2, "epoch": 0.5959595959595959, "grad_norm": 1.4124473333358765, "kl": 0.047810099087655544, "learning_rate": 2.031731927669616e-06, "loss": 0.009508136659860611, "memory(GiB)": 69.34, "reward": 0.7230499267578125, "reward_std": 0.35642276927828787, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.12304992973804474, "rewards/ReportKG_Jaccard/std": 0.05467298850417137, "step": 2950, "train_speed(iter/s)": 0.036982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 72.325, "completions/min_length": 56.4, "epoch": 0.5969696969696969, "grad_norm": 1.0775891542434692, "kl": 0.056794007495045665, "learning_rate": 2.0237993826336286e-06, "loss": 0.05480659008026123, "memory(GiB)": 69.34, "reward": 0.7701055139303208, "reward_std": 0.38984134048223495, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.19510551244020463, "rewards/ReportKG_Jaccard/std": 0.0644174113869667, "step": 2955, "train_speed(iter/s)": 0.036999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 69.45, "completions/min_length": 52.0, "epoch": 0.597979797979798, "grad_norm": 1.3205026388168335, "kl": 0.02702869102358818, "learning_rate": 2.0158664631298193e-06, "loss": 0.04771896004676819, "memory(GiB)": 69.34, "reward": 0.8344072103500366, "reward_std": 0.4133100688457489, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.23440721333026887, "rewards/ReportKG_Jaccard/std": 0.07018213942646981, "step": 2960, "train_speed(iter/s)": 0.037009 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 73.0, "completions/min_length": 55.2, "epoch": 0.598989898989899, "grad_norm": 1.418331503868103, "kl": 0.04302375912666321, "learning_rate": 2.007933293977522e-06, "loss": 0.07904212474822998, "memory(GiB)": 69.34, "reward": 0.6822223365306854, "reward_std": 0.43078928142786027, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.23222232460975648, "rewards/ReportKG_Jaccard/std": 0.0872103676199913, "step": 2965, "train_speed(iter/s)": 0.037024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 68.25, "completions/min_length": 53.2, "epoch": 0.6, "grad_norm": 1.4001513719558716, "kl": 0.02319426704198122, "learning_rate": 2e-06, "loss": 0.031166139245033263, "memory(GiB)": 69.34, "reward": 0.9736706733703613, "reward_std": 0.4188646003603935, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3812558650970459, "rewards/ReportKG_Jaccard/mean": 0.29867070615291597, "rewards/ReportKG_Jaccard/std": 0.07284173592925072, "step": 2970, "train_speed(iter/s)": 0.037036 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 72.525, "completions/min_length": 48.6, "epoch": 0.601010101010101, "grad_norm": 1.1113262176513672, "kl": 0.038292941823601725, "learning_rate": 1.9920667060224774e-06, "loss": 0.05510597825050354, "memory(GiB)": 69.34, "reward": 0.46457222700119016, "reward_std": 0.4240479528903961, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.1895722158253193, "rewards/ReportKG_Jaccard/std": 0.06305970661342145, "step": 2975, "train_speed(iter/s)": 0.037029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.4, "completions/mean_length": 74.075, "completions/min_length": 59.2, "epoch": 0.602020202020202, "grad_norm": 1.3163899183273315, "kl": 0.04856577068567276, "learning_rate": 1.984133536870181e-06, "loss": 0.028519445657730104, "memory(GiB)": 69.34, "reward": 0.90143324136734, "reward_std": 0.3838435083627701, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.2514332354068756, "rewards/ReportKG_Jaccard/std": 0.05814993716776371, "step": 2980, "train_speed(iter/s)": 0.037047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.0, "completions/mean_length": 65.025, "completions/min_length": 51.6, "epoch": 0.603030303030303, "grad_norm": 1.2787046432495117, "kl": 0.02475835643708706, "learning_rate": 1.9762006173663716e-06, "loss": -0.028548997640609742, "memory(GiB)": 69.34, "reward": 0.7360802322626114, "reward_std": 0.19723442532122135, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2110802412033081, "rewards/ReportKG_Jaccard/std": 0.07204401232302189, "step": 2985, "train_speed(iter/s)": 0.03703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 65.775, "completions/min_length": 47.0, "epoch": 0.604040404040404, "grad_norm": 1.351641297340393, "kl": 0.01945328079164028, "learning_rate": 1.968268072330384e-06, "loss": 0.00127863809466362, "memory(GiB)": 69.34, "reward": 0.49574858546257017, "reward_std": 0.20551005899906158, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.24574858248233794, "rewards/ReportKG_Jaccard/std": 0.07754097953438759, "step": 2990, "train_speed(iter/s)": 0.037038 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 74.775, "completions/min_length": 51.4, "epoch": 0.6050505050505051, "grad_norm": 1.147788405418396, "kl": 0.03925382066518068, "learning_rate": 1.9603360265756602e-06, "loss": -0.00104018896818161, "memory(GiB)": 69.34, "reward": 0.8220908403396606, "reward_std": 0.43212844133377076, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.22209080159664155, "rewards/ReportKG_Jaccard/std": 0.07739903852343559, "step": 2995, "train_speed(iter/s)": 0.037023 }, { "epoch": 0.6060606060606061, "grad_norm": 1.1495405435562134, "learning_rate": 1.952404604907785e-06, "loss": 0.024305400252342225, "memory(GiB)": 69.34, "step": 3000, "train_speed(iter/s)": 0.037014 }, { "epoch": 0.6060606060606061, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 101.42, "eval_completions/mean_length": 76.515, "eval_completions/min_length": 56.78, "eval_kl": 0.03557205138728023, "eval_loss": 0.013191914185881615, "eval_reward": 0.7723606064915657, "eval_reward_std": 0.293953995667398, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.5575, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.251661850810051, "eval_rewards/ReportKG_Jaccard/mean": 0.21486061155796052, "eval_rewards/ReportKG_Jaccard/std": 0.06203720111399889, "eval_runtime": 907.3195, "eval_samples_per_second": 0.055, "eval_steps_per_second": 0.008, "step": 3000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.1, "completions/mean_length": 80.2, "completions/min_length": 64.3, "epoch": 0.6070707070707071, "grad_norm": 1.1760483980178833, "kl": 0.03640040047466755, "learning_rate": 1.9444739321225255e-06, "loss": 0.02245338410139084, "memory(GiB)": 69.34, "reward": 0.7585436671972274, "reward_std": 0.27257357686758044, "rewards/MultiModalAccuracyORM_Any/mean": 0.5375, "rewards/MultiModalAccuracyORM_Any/std": 0.22220884561538695, "rewards/ReportKG_Jaccard/mean": 0.22104365825653077, "rewards/ReportKG_Jaccard/std": 0.07133823744952679, "step": 3005, "train_speed(iter/s)": 0.036572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 71.225, "completions/min_length": 55.0, "epoch": 0.6080808080808081, "grad_norm": 1.1090859174728394, "kl": 0.05673280730843544, "learning_rate": 1.936544133003865e-06, "loss": 0.10702905654907227, "memory(GiB)": 69.34, "reward": 0.918166995048523, "reward_std": 0.23318190276622772, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1776151716709137, "rewards/ReportKG_Jaccard/mean": 0.19316698610782623, "rewards/ReportKG_Jaccard/std": 0.0958629347383976, "step": 3010, "train_speed(iter/s)": 0.036582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.4, "completions/mean_length": 78.375, "completions/min_length": 57.0, "epoch": 0.6090909090909091, "grad_norm": 1.0572545528411865, "kl": 0.032387949712574485, "learning_rate": 1.928615332322039e-06, "loss": 0.03807676732540131, "memory(GiB)": 69.34, "reward": 0.9482906877994537, "reward_std": 0.37767054438591, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.27329068779945376, "rewards/ReportKG_Jaccard/std": 0.07056338097900153, "step": 3015, "train_speed(iter/s)": 0.036574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.6, "completions/mean_length": 68.3, "completions/min_length": 58.0, "epoch": 0.6101010101010101, "grad_norm": 1.0449670553207397, "kl": 0.04183646589517594, "learning_rate": 1.9206876548315754e-06, "loss": 0.04717768728733063, "memory(GiB)": 69.34, "reward": 1.0957980513572694, "reward_std": 0.2559019260108471, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.3707980692386627, "rewards/ReportKG_Jaccard/std": 0.10651929080486297, "step": 3020, "train_speed(iter/s)": 0.036555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.2, "completions/mean_length": 68.975, "completions/min_length": 57.2, "epoch": 0.6111111111111112, "grad_norm": 0.9866381883621216, "kl": 0.04737714603543282, "learning_rate": 1.912761225269328e-06, "loss": 0.051834499835968016, "memory(GiB)": 69.34, "reward": 0.8570313334465027, "reward_std": 0.3799906626343727, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.23203133642673493, "rewards/ReportKG_Jaccard/std": 0.06391645297408104, "step": 3025, "train_speed(iter/s)": 0.036572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.4, "completions/mean_length": 67.35, "completions/min_length": 51.4, "epoch": 0.6121212121212121, "grad_norm": 1.1802133321762085, "kl": 0.03020840175449848, "learning_rate": 1.9048361683525153e-06, "loss": 0.038327348232269284, "memory(GiB)": 69.34, "reward": 0.6994688749313355, "reward_std": 0.47998310923576354, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.2494688779115677, "rewards/ReportKG_Jaccard/std": 0.07237548902630805, "step": 3030, "train_speed(iter/s)": 0.036578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 68.6, "completions/min_length": 46.0, "epoch": 0.6131313131313131, "grad_norm": 1.6822487115859985, "kl": 0.04603103399276733, "learning_rate": 1.8969126087767586e-06, "loss": 0.06079773902893067, "memory(GiB)": 69.34, "reward": 1.0824753284454345, "reward_std": 0.2278478041291237, "rewards/MultiModalAccuracyORM_Any/mean": 0.9, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.18247530311346055, "rewards/ReportKG_Jaccard/std": 0.06688360869884491, "step": 3035, "train_speed(iter/s)": 0.036592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 76.1, "completions/min_length": 57.8, "epoch": 0.6141414141414141, "grad_norm": 1.4333422183990479, "kl": 0.046260157600045204, "learning_rate": 1.88899067121412e-06, "loss": 0.037949991226196286, "memory(GiB)": 69.34, "reward": 0.9705267906188965, "reward_std": 0.38709217607975005, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.27052681148052216, "rewards/ReportKG_Jaccard/std": 0.0833638459444046, "step": 3040, "train_speed(iter/s)": 0.036597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.8, "completions/mean_length": 72.0, "completions/min_length": 56.6, "epoch": 0.6151515151515151, "grad_norm": 1.1212637424468994, "kl": 0.03538461476564407, "learning_rate": 1.8810704803111382e-06, "loss": 0.058896738290786746, "memory(GiB)": 69.34, "reward": 0.7585923731327057, "reward_std": 0.46873605251312256, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.2335923932492733, "rewards/ReportKG_Jaccard/std": 0.0483750730752945, "step": 3045, "train_speed(iter/s)": 0.036609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.4, "completions/mean_length": 79.075, "completions/min_length": 62.8, "epoch": 0.6161616161616161, "grad_norm": 1.911247968673706, "kl": 0.027115472964942454, "learning_rate": 1.8731521606868709e-06, "loss": 0.003924626111984253, "memory(GiB)": 69.34, "reward": 0.8152792274951934, "reward_std": 0.5011224687099457, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4847656965255737, "rewards/ReportKG_Jaccard/mean": 0.2652792453765869, "rewards/ReportKG_Jaccard/std": 0.06861429587006569, "step": 3050, "train_speed(iter/s)": 0.036615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 71.0, "completions/min_length": 54.0, "epoch": 0.6171717171717171, "grad_norm": 1.3115075826644897, "kl": 0.047620901837944984, "learning_rate": 1.8652358369309332e-06, "loss": 0.031493422389030454, "memory(GiB)": 69.34, "reward": 0.9667828440666199, "reward_std": 0.37106369584798815, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.26678287088871, "rewards/ReportKG_Jaccard/std": 0.059428741410374644, "step": 3055, "train_speed(iter/s)": 0.036624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.6, "completions/mean_length": 79.275, "completions/min_length": 58.4, "epoch": 0.6181818181818182, "grad_norm": 1.2425354719161987, "kl": 0.034106526896357535, "learning_rate": 1.8573216336015353e-06, "loss": 0.04111878871917725, "memory(GiB)": 69.34, "reward": 0.8927270889282226, "reward_std": 0.3686291307210922, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.29272711277008057, "rewards/ReportKG_Jaccard/std": 0.1252473458647728, "step": 3060, "train_speed(iter/s)": 0.036626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.4, "completions/mean_length": 75.9, "completions/min_length": 57.2, "epoch": 0.6191919191919192, "grad_norm": 1.7622405290603638, "kl": 0.04634121730923653, "learning_rate": 1.8494096752235238e-06, "loss": 0.01960424780845642, "memory(GiB)": 69.34, "reward": 0.6727063238620759, "reward_std": 0.4408457696437836, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.222706338763237, "rewards/ReportKG_Jaccard/std": 0.06547513380646705, "step": 3065, "train_speed(iter/s)": 0.036634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.6, "completions/mean_length": 68.075, "completions/min_length": 56.2, "epoch": 0.6202020202020202, "grad_norm": 1.7075015306472778, "kl": 0.05217092558741569, "learning_rate": 1.8415000862864227e-06, "loss": 0.035765478014945985, "memory(GiB)": 69.34, "reward": 0.5118102580308914, "reward_std": 0.4181797236204147, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.40650616884231566, "rewards/ReportKG_Jaccard/mean": 0.16181024312973022, "rewards/ReportKG_Jaccard/std": 0.061005978286266326, "step": 3070, "train_speed(iter/s)": 0.036652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 78.15, "completions/min_length": 60.0, "epoch": 0.6212121212121212, "grad_norm": 1.2919996976852417, "kl": 0.03266846500337124, "learning_rate": 1.8335929912424755e-06, "loss": 0.05629806518554688, "memory(GiB)": 69.34, "reward": 0.676854532957077, "reward_std": 0.35401005297899246, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.3018545299768448, "rewards/ReportKG_Jaccard/std": 0.11024384126067162, "step": 3075, "train_speed(iter/s)": 0.036663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 79.15, "completions/min_length": 55.4, "epoch": 0.6222222222222222, "grad_norm": 1.1237876415252686, "kl": 0.04163816422224045, "learning_rate": 1.8256885145046834e-06, "loss": 0.03088322877883911, "memory(GiB)": 69.34, "reward": 0.8794128060340881, "reward_std": 0.4840639740228653, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.40650616884231566, "rewards/ReportKG_Jaccard/mean": 0.22941280603408815, "rewards/ReportKG_Jaccard/std": 0.09115951098501682, "step": 3080, "train_speed(iter/s)": 0.036663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.4, "completions/mean_length": 67.775, "completions/min_length": 47.2, "epoch": 0.6232323232323232, "grad_norm": 1.433934211730957, "kl": 0.047330130636692044, "learning_rate": 1.8177867804448525e-06, "loss": 0.0020619630813598633, "memory(GiB)": 69.34, "reward": 0.8524197995662689, "reward_std": 0.3731569230556488, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3484567105770111, "rewards/ReportKG_Jaccard/mean": 0.22741980254650115, "rewards/ReportKG_Jaccard/std": 0.04391477033495903, "step": 3085, "train_speed(iter/s)": 0.036652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 74.0, "completions/min_length": 59.0, "epoch": 0.6242424242424243, "grad_norm": 1.0972164869308472, "kl": 0.039367178454995155, "learning_rate": 1.809887913391635e-06, "loss": -0.00595422200858593, "memory(GiB)": 69.34, "reward": 0.6014943271875381, "reward_std": 0.3876572668552399, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.20149431824684144, "rewards/ReportKG_Jaccard/std": 0.04544955193996429, "step": 3090, "train_speed(iter/s)": 0.036654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.2, "completions/mean_length": 61.975, "completions/min_length": 48.4, "epoch": 0.6252525252525253, "grad_norm": 1.1199334859848022, "kl": 0.029750412702560423, "learning_rate": 1.8019920376285693e-06, "loss": -0.012619508802890778, "memory(GiB)": 69.34, "reward": 0.7771217346191406, "reward_std": 0.3918392986059189, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.20212172865867614, "rewards/ReportKG_Jaccard/std": 0.08316046744585037, "step": 3095, "train_speed(iter/s)": 0.036664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 72.025, "completions/min_length": 56.8, "epoch": 0.6262626262626263, "grad_norm": 0.8981653451919556, "kl": 0.05328359305858612, "learning_rate": 1.794099277392131e-06, "loss": 0.05638675689697266, "memory(GiB)": 69.34, "reward": 0.8099569797515869, "reward_std": 0.32251286804676055, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.28495696783065794, "rewards/ReportKG_Jaccard/std": 0.07139874584972858, "step": 3100, "train_speed(iter/s)": 0.036656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.8, "completions/mean_length": 62.425, "completions/min_length": 51.0, "epoch": 0.6272727272727273, "grad_norm": 1.1593016386032104, "kl": 0.030260414630174638, "learning_rate": 1.7862097568697747e-06, "loss": 0.021527813374996187, "memory(GiB)": 69.34, "reward": 0.6046199083328248, "reward_std": 0.4833982288837433, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.22961989045143127, "rewards/ReportKG_Jaccard/std": 0.05781119093298912, "step": 3105, "train_speed(iter/s)": 0.036654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 71.05, "completions/min_length": 55.2, "epoch": 0.6282828282828283, "grad_norm": 0.9541289210319519, "kl": 0.027087344974279403, "learning_rate": 1.778323600197978e-06, "loss": 0.0037292182445526124, "memory(GiB)": 69.34, "reward": 1.065853750705719, "reward_std": 0.36150954812765124, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.24085376858711244, "rewards/ReportKG_Jaccard/std": 0.05739547312259674, "step": 3110, "train_speed(iter/s)": 0.036671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.6, "completions/mean_length": 63.425, "completions/min_length": 46.6, "epoch": 0.6292929292929293, "grad_norm": 1.2247891426086426, "kl": 0.026671938970685004, "learning_rate": 1.7704409314602927e-06, "loss": 0.04545852839946747, "memory(GiB)": 69.34, "reward": 0.5253836303949356, "reward_std": 0.40212125778198243, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.15038364827632905, "rewards/ReportKG_Jaccard/std": 0.059728910028934476, "step": 3115, "train_speed(iter/s)": 0.036687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.4, "completions/mean_length": 66.675, "completions/min_length": 49.6, "epoch": 0.6303030303030303, "grad_norm": 1.4082295894622803, "kl": 0.029240479134023188, "learning_rate": 1.76256187468539e-06, "loss": -0.03309783935546875, "memory(GiB)": 69.34, "reward": 0.8237246394157409, "reward_std": 0.5068879842758178, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.1987246185541153, "rewards/ReportKG_Jaccard/std": 0.06386268027126789, "step": 3120, "train_speed(iter/s)": 0.036702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 72.85, "completions/min_length": 55.0, "epoch": 0.6313131313131313, "grad_norm": 1.3958321809768677, "kl": 0.029525811597704888, "learning_rate": 1.754686553845107e-06, "loss": 0.042358973622322084, "memory(GiB)": 69.34, "reward": 0.5832895874977112, "reward_std": 0.29736304432153704, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2582895815372467, "rewards/ReportKG_Jaccard/std": 0.07422790676355362, "step": 3125, "train_speed(iter/s)": 0.036713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.8, "completions/mean_length": 75.925, "completions/min_length": 52.8, "epoch": 0.6323232323232323, "grad_norm": 1.4727942943572998, "kl": 0.02469813022762537, "learning_rate": 1.7468150928525012e-06, "loss": -0.011866424977779389, "memory(GiB)": 69.34, "reward": 0.7993990778923035, "reward_std": 0.4049652352929115, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.2993990868330002, "rewards/ReportKG_Jaccard/std": 0.07983295917510987, "step": 3130, "train_speed(iter/s)": 0.036717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 79.2, "completions/mean_length": 66.65, "completions/min_length": 53.0, "epoch": 0.6333333333333333, "grad_norm": 1.2755159139633179, "kl": 0.02753332760185003, "learning_rate": 1.7389476155598973e-06, "loss": 0.006575850397348404, "memory(GiB)": 69.34, "reward": 0.9040530681610107, "reward_std": 0.2767082117497921, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.32905306816101076, "rewards/ReportKG_Jaccard/std": 0.08752235546708106, "step": 3135, "train_speed(iter/s)": 0.036733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.0, "completions/mean_length": 88.375, "completions/min_length": 62.8, "epoch": 0.6343434343434343, "grad_norm": 1.1238467693328857, "kl": 0.027814288064837454, "learning_rate": 1.7310842457569363e-06, "loss": -8.326414972543716e-05, "memory(GiB)": 69.34, "reward": 0.7137832462787628, "reward_std": 0.3277016207575798, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.13878326416015624, "rewards/ReportKG_Jaccard/std": 0.05798420235514641, "step": 3140, "train_speed(iter/s)": 0.036734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 66.1, "completions/min_length": 42.2, "epoch": 0.6353535353535353, "grad_norm": 1.7997115850448608, "kl": 0.026170283928513526, "learning_rate": 1.723225107168634e-06, "loss": 0.005198686569929123, "memory(GiB)": 69.34, "reward": 0.9594392776489258, "reward_std": 0.42988348603248594, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.20943929851055146, "rewards/ReportKG_Jaccard/std": 0.06375515572726727, "step": 3145, "train_speed(iter/s)": 0.03674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.6, "completions/mean_length": 79.825, "completions/min_length": 61.8, "epoch": 0.6363636363636364, "grad_norm": 1.1814112663269043, "kl": 0.04559694342315197, "learning_rate": 1.7153703234534298e-06, "loss": 0.05831131339073181, "memory(GiB)": 69.34, "reward": 1.0216470599174499, "reward_std": 0.2948886275291443, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.19664708971977235, "rewards/ReportKG_Jaccard/std": 0.061551763117313384, "step": 3150, "train_speed(iter/s)": 0.036736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 81.275, "completions/min_length": 62.6, "epoch": 0.6373737373737374, "grad_norm": 1.3064955472946167, "kl": 0.025246407091617584, "learning_rate": 1.7075200182012406e-06, "loss": 0.010842625796794892, "memory(GiB)": 69.34, "reward": 0.6423848748207093, "reward_std": 0.366947840154171, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.31392415761947634, "rewards/ReportKG_Jaccard/mean": 0.19238485991954804, "rewards/ReportKG_Jaccard/std": 0.06409067064523696, "step": 3155, "train_speed(iter/s)": 0.036746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 77.975, "completions/min_length": 58.2, "epoch": 0.6383838383838384, "grad_norm": 1.0947566032409668, "kl": 0.02598224878311157, "learning_rate": 1.6996743149315192e-06, "loss": 0.04588406383991241, "memory(GiB)": 69.34, "reward": 0.9868378877639771, "reward_std": 0.41939887404441833, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.26183792054653166, "rewards/ReportKG_Jaccard/std": 0.06163203418254852, "step": 3160, "train_speed(iter/s)": 0.036743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 70.875, "completions/min_length": 53.4, "epoch": 0.6393939393939394, "grad_norm": 1.5732895135879517, "kl": 0.023291292041540144, "learning_rate": 1.691833337091309e-06, "loss": -1.2268126010894775e-05, "memory(GiB)": 69.34, "reward": 0.5793582558631897, "reward_std": 0.4703631103038788, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4519508481025696, "rewards/ReportKG_Jaccard/mean": 0.2043582648038864, "rewards/ReportKG_Jaccard/std": 0.04729876443743706, "step": 3165, "train_speed(iter/s)": 0.036738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 68.825, "completions/min_length": 50.6, "epoch": 0.6404040404040404, "grad_norm": 1.2286090850830078, "kl": 0.05049346201121807, "learning_rate": 1.6839972080533e-06, "loss": 0.03876896500587464, "memory(GiB)": 69.34, "reward": 0.9567021489143371, "reward_std": 0.3480044223368168, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.25670215487480164, "rewards/ReportKG_Jaccard/std": 0.07748676240444183, "step": 3170, "train_speed(iter/s)": 0.03675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.8, "completions/mean_length": 68.25, "completions/min_length": 48.4, "epoch": 0.6414141414141414, "grad_norm": 1.1286799907684326, "kl": 0.03535272218286991, "learning_rate": 1.6761660511138922e-06, "loss": 0.07407874464988709, "memory(GiB)": 69.34, "reward": 0.9822226285934448, "reward_std": 0.2403483808040619, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2072226285934448, "rewards/ReportKG_Jaccard/std": 0.06800147742033005, "step": 3175, "train_speed(iter/s)": 0.036766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 76.25, "completions/min_length": 58.8, "epoch": 0.6424242424242425, "grad_norm": 1.5702118873596191, "kl": 0.029927522130310535, "learning_rate": 1.668339989491252e-06, "loss": -0.03139708638191223, "memory(GiB)": 69.34, "reward": 0.4830657660961151, "reward_std": 0.35065808147192, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.18306574821472169, "rewards/ReportKG_Jaccard/std": 0.04794417060911656, "step": 3180, "train_speed(iter/s)": 0.036754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 74.6, "completions/min_length": 58.0, "epoch": 0.6434343434343435, "grad_norm": 1.549972653388977, "kl": 0.03932462893426418, "learning_rate": 1.6605191463233722e-06, "loss": 0.03136629164218903, "memory(GiB)": 69.34, "reward": 0.8014854311943054, "reward_std": 0.29688464775681495, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.22648542299866675, "rewards/ReportKG_Jaccard/std": 0.06313062459230423, "step": 3185, "train_speed(iter/s)": 0.036732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.2, "completions/mean_length": 59.825, "completions/min_length": 46.6, "epoch": 0.6444444444444445, "grad_norm": 1.7931171655654907, "kl": 0.03625096529722214, "learning_rate": 1.6527036446661393e-06, "loss": 0.043480494618415834, "memory(GiB)": 69.34, "reward": 0.9074513077735901, "reward_std": 0.32366577833890914, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2824513018131256, "rewards/ReportKG_Jaccard/std": 0.10263880640268326, "step": 3190, "train_speed(iter/s)": 0.03674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.2, "completions/mean_length": 74.175, "completions/min_length": 58.2, "epoch": 0.6454545454545455, "grad_norm": 0.9426072239875793, "kl": 0.03553901147097349, "learning_rate": 1.6448936074913938e-06, "loss": -0.05072459578514099, "memory(GiB)": 69.34, "reward": 0.6178061604499817, "reward_std": 0.33180115222930906, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.21780616492033006, "rewards/ReportKG_Jaccard/std": 0.06851602867245674, "step": 3195, "train_speed(iter/s)": 0.036756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 78.3, "completions/min_length": 62.8, "epoch": 0.6464646464646465, "grad_norm": 1.5891549587249756, "kl": 0.03499589078128338, "learning_rate": 1.6370891576849948e-06, "loss": -0.04180422723293305, "memory(GiB)": 69.34, "reward": 0.9186882138252258, "reward_std": 0.2704062402248383, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2686882227659225, "rewards/ReportKG_Jaccard/std": 0.05682169497013092, "step": 3200, "train_speed(iter/s)": 0.036749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 126.2, "completions/mean_length": 90.325, "completions/min_length": 64.2, "epoch": 0.6474747474747474, "grad_norm": 1.0881496667861938, "kl": 0.03337543234229088, "learning_rate": 1.62929041804489e-06, "loss": -0.0016032833606004714, "memory(GiB)": 69.34, "reward": 0.9526726722717285, "reward_std": 0.48414087295532227, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.45196654200553893, "rewards/ReportKG_Jaccard/mean": 0.20267268717288972, "rewards/ReportKG_Jaccard/std": 0.06238369271159172, "step": 3205, "train_speed(iter/s)": 0.036754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 77.175, "completions/min_length": 60.4, "epoch": 0.6484848484848484, "grad_norm": 1.116898775100708, "kl": 0.037266647815704344, "learning_rate": 1.6214975112791802e-06, "loss": 0.028068161010742186, "memory(GiB)": 69.34, "reward": 0.9355088263750077, "reward_std": 0.29040229320526123, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2355088323354721, "rewards/ReportKG_Jaccard/std": 0.07229256927967072, "step": 3210, "train_speed(iter/s)": 0.036762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 75.65, "completions/min_length": 63.0, "epoch": 0.6494949494949495, "grad_norm": 1.0297471284866333, "kl": 0.04394868668168783, "learning_rate": 1.6137105600041885e-06, "loss": 0.03737530708312988, "memory(GiB)": 69.34, "reward": 0.7514216542243958, "reward_std": 0.3079694002866745, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.20142168253660203, "rewards/ReportKG_Jaccard/std": 0.05885260924696922, "step": 3215, "train_speed(iter/s)": 0.036767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 69.475, "completions/min_length": 56.0, "epoch": 0.6505050505050505, "grad_norm": 1.5945912599563599, "kl": 0.03202822171151638, "learning_rate": 1.6059296867425341e-06, "loss": 0.018934991955757142, "memory(GiB)": 69.34, "reward": 0.7718079686164856, "reward_std": 0.4299927845597267, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3921836853027344, "rewards/ReportKG_Jaccard/mean": 0.17180797904729844, "rewards/ReportKG_Jaccard/std": 0.05223868787288666, "step": 3220, "train_speed(iter/s)": 0.036779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 134.8, "completions/mean_length": 97.35, "completions/min_length": 71.2, "epoch": 0.6515151515151515, "grad_norm": 1.123537540435791, "kl": 0.02749153636395931, "learning_rate": 1.598155013921202e-06, "loss": 0.02532670795917511, "memory(GiB)": 69.34, "reward": 1.0156516909599305, "reward_std": 0.28823547065258026, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.1656516522169113, "rewards/ReportKG_Jaccard/std": 0.05416441485285759, "step": 3225, "train_speed(iter/s)": 0.03678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 74.25, "completions/min_length": 58.0, "epoch": 0.6525252525252525, "grad_norm": 1.0198984146118164, "kl": 0.025003114342689516, "learning_rate": 1.590386663869619e-06, "loss": 0.05591369271278381, "memory(GiB)": 69.34, "reward": 0.8513223648071289, "reward_std": 0.26737114787101746, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.276322340965271, "rewards/ReportKG_Jaccard/std": 0.0738731049001217, "step": 3230, "train_speed(iter/s)": 0.036798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.4, "completions/mean_length": 63.125, "completions/min_length": 47.8, "epoch": 0.6535353535353535, "grad_norm": 1.0700066089630127, "kl": 0.057919809222221376, "learning_rate": 1.5826247588177223e-06, "loss": 0.04896347522735596, "memory(GiB)": 69.34, "reward": 0.9140642166137696, "reward_std": 0.4624829411506653, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.21406423151493073, "rewards/ReportKG_Jaccard/std": 0.06685897409915924, "step": 3235, "train_speed(iter/s)": 0.036812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 72.7, "completions/min_length": 55.6, "epoch": 0.6545454545454545, "grad_norm": 1.0539593696594238, "kl": 0.03766736574470997, "learning_rate": 1.5748694208940464e-06, "loss": 0.024513761699199676, "memory(GiB)": 69.34, "reward": 1.0435152649879456, "reward_std": 0.33261668831110003, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.26851524114608766, "rewards/ReportKG_Jaccard/std": 0.06411939449608325, "step": 3240, "train_speed(iter/s)": 0.036799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 76.75, "completions/min_length": 52.6, "epoch": 0.6555555555555556, "grad_norm": 0.8243998289108276, "kl": 0.05265835449099541, "learning_rate": 1.5671207721237943e-06, "loss": 0.003310282528400421, "memory(GiB)": 69.34, "reward": 0.6847774147987366, "reward_std": 0.3710427075624466, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.20977741628885269, "rewards/ReportKG_Jaccard/std": 0.09651854336261749, "step": 3245, "train_speed(iter/s)": 0.036803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 82.325, "completions/min_length": 62.4, "epoch": 0.6565656565656566, "grad_norm": 1.1686900854110718, "kl": 0.03809193782508373, "learning_rate": 1.5593789344269187e-06, "loss": -0.001249922439455986, "memory(GiB)": 69.34, "reward": 0.8746652692556381, "reward_std": 0.3821204364299774, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.174665230512619, "rewards/ReportKG_Jaccard/std": 0.0605463981628418, "step": 3250, "train_speed(iter/s)": 0.036817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 73.15, "completions/min_length": 55.8, "epoch": 0.6575757575757576, "grad_norm": 0.6319504976272583, "kl": 0.024245496653020383, "learning_rate": 1.5516440296162057e-06, "loss": -0.022039589285850526, "memory(GiB)": 69.34, "reward": 0.6489877194166184, "reward_std": 0.35036413222551344, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.2989877060055733, "rewards/ReportKG_Jaccard/std": 0.06884533613920212, "step": 3255, "train_speed(iter/s)": 0.03681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 76.15, "completions/min_length": 62.2, "epoch": 0.6585858585858586, "grad_norm": 0.9793215990066528, "kl": 0.01966285314410925, "learning_rate": 1.5439161793953574e-06, "loss": 0.017535687983036043, "memory(GiB)": 69.34, "reward": 1.0253162503242492, "reward_std": 0.49634761214256284, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.3253162860870361, "rewards/ReportKG_Jaccard/std": 0.08619294017553329, "step": 3260, "train_speed(iter/s)": 0.036796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.6, "completions/mean_length": 62.275, "completions/min_length": 49.2, "epoch": 0.6595959595959596, "grad_norm": 1.6822547912597656, "kl": 0.019861295260488986, "learning_rate": 1.5361955053570738e-06, "loss": -0.020702242851257324, "memory(GiB)": 69.34, "reward": 0.7043366074562073, "reward_std": 0.3834036782383919, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.2543365925550461, "rewards/ReportKG_Jaccard/std": 0.060450770705938336, "step": 3265, "train_speed(iter/s)": 0.036802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 70.825, "completions/min_length": 50.2, "epoch": 0.6606060606060606, "grad_norm": 1.244743824005127, "kl": 0.03755063656717539, "learning_rate": 1.5284821289811452e-06, "loss": 0.019070181250572204, "memory(GiB)": 69.34, "reward": 0.8466306924819946, "reward_std": 0.30630243569612503, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.1216307058930397, "rewards/ReportKG_Jaccard/std": 0.06169168725609779, "step": 3270, "train_speed(iter/s)": 0.03679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 62.5, "completions/min_length": 47.8, "epoch": 0.6616161616161617, "grad_norm": 0.8737350702285767, "kl": 0.023045165836811064, "learning_rate": 1.520776171632538e-06, "loss": 0.08658780455589295, "memory(GiB)": 69.34, "reward": 1.1095451891422272, "reward_std": 0.19747945293784142, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.28454520255327226, "rewards/ReportKG_Jaccard/std": 0.1319945551455021, "step": 3275, "train_speed(iter/s)": 0.036775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.8, "completions/mean_length": 66.8, "completions/min_length": 52.8, "epoch": 0.6626262626262627, "grad_norm": 1.5967360734939575, "kl": 0.025660222209990025, "learning_rate": 1.5130777545594821e-06, "loss": -0.01053164228796959, "memory(GiB)": 69.34, "reward": 0.676762455701828, "reward_std": 0.3623202070593834, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.22676246911287307, "rewards/ReportKG_Jaccard/std": 0.055405231565237044, "step": 3280, "train_speed(iter/s)": 0.036778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.2, "completions/mean_length": 81.825, "completions/min_length": 58.6, "epoch": 0.6636363636363637, "grad_norm": 1.195202350616455, "kl": 0.036965499818325046, "learning_rate": 1.5053869988915687e-06, "loss": 0.025850903987884522, "memory(GiB)": 69.34, "reward": 0.6321325421333313, "reward_std": 0.373279245197773, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.2071325421333313, "rewards/ReportKG_Jaccard/std": 0.055473935604095456, "step": 3285, "train_speed(iter/s)": 0.03677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.6, "completions/mean_length": 77.175, "completions/min_length": 61.8, "epoch": 0.6646464646464646, "grad_norm": 0.8824169635772705, "kl": 0.030281320586800574, "learning_rate": 1.4977040256378416e-06, "loss": -0.03359757661819458, "memory(GiB)": 69.34, "reward": 0.673203706741333, "reward_std": 0.45029959082603455, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.42594102025032043, "rewards/ReportKG_Jaccard/mean": 0.24820370972156525, "rewards/ReportKG_Jaccard/std": 0.07234358005225658, "step": 3290, "train_speed(iter/s)": 0.036786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.4, "completions/mean_length": 73.075, "completions/min_length": 51.0, "epoch": 0.6656565656565656, "grad_norm": 0.9289829730987549, "kl": 0.03578100390732288, "learning_rate": 1.4900289556848918e-06, "loss": -0.020348012447357178, "memory(GiB)": 69.34, "reward": 1.0327626585960388, "reward_std": 0.30380218625068667, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.18276267051696776, "rewards/ReportKG_Jaccard/std": 0.056669986248016356, "step": 3295, "train_speed(iter/s)": 0.036803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.6, "completions/mean_length": 77.975, "completions/min_length": 62.0, "epoch": 0.6666666666666666, "grad_norm": 1.3250813484191895, "kl": 0.05358357280492783, "learning_rate": 1.4823619097949582e-06, "loss": 0.006289897114038467, "memory(GiB)": 69.34, "reward": 0.8941571652889252, "reward_std": 0.24569633826613427, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.16915716733783484, "rewards/ReportKG_Jaccard/std": 0.05031975582242012, "step": 3300, "train_speed(iter/s)": 0.036794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 71.875, "completions/min_length": 54.0, "epoch": 0.6676767676767676, "grad_norm": 1.078951120376587, "kl": 0.048220448568463324, "learning_rate": 1.474703008604027e-06, "loss": 0.09848667979240418, "memory(GiB)": 69.34, "reward": 0.8730157673358917, "reward_std": 0.244551682472229, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.24801576137542725, "rewards/ReportKG_Jaccard/std": 0.0679500311613083, "step": 3305, "train_speed(iter/s)": 0.036799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 73.275, "completions/min_length": 57.0, "epoch": 0.6686868686868687, "grad_norm": 1.1694278717041016, "kl": 0.03313102163374424, "learning_rate": 1.4670523726199302e-06, "loss": 0.007056770473718643, "memory(GiB)": 69.34, "reward": 0.9347275078296662, "reward_std": 0.3440787926316261, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.3347274959087372, "rewards/ReportKG_Jaccard/std": 0.08478497713804245, "step": 3310, "train_speed(iter/s)": 0.036817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.8, "completions/mean_length": 73.0, "completions/min_length": 58.0, "epoch": 0.6696969696969697, "grad_norm": 1.2307413816452026, "kl": 0.04171382077038288, "learning_rate": 1.4594101222204543e-06, "loss": 0.020469054579734802, "memory(GiB)": 69.34, "reward": 0.9732543110847474, "reward_std": 0.3357157588005066, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.2482543021440506, "rewards/ReportKG_Jaccard/std": 0.0751670978963375, "step": 3315, "train_speed(iter/s)": 0.036812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.0, "completions/mean_length": 71.5, "completions/min_length": 50.6, "epoch": 0.6707070707070707, "grad_norm": 1.3277442455291748, "kl": 0.04623520914465189, "learning_rate": 1.4517763776514452e-06, "loss": 0.03995850682258606, "memory(GiB)": 69.34, "reward": 0.8743650019168854, "reward_std": 0.44854367822408675, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.2743650048971176, "rewards/ReportKG_Jaccard/std": 0.12401274889707566, "step": 3320, "train_speed(iter/s)": 0.03681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.6, "completions/mean_length": 84.9, "completions/min_length": 68.6, "epoch": 0.6717171717171717, "grad_norm": 1.505733847618103, "kl": 0.0443060677498579, "learning_rate": 1.4441512590249114e-06, "loss": 0.04281973838806152, "memory(GiB)": 69.34, "reward": 1.035963487625122, "reward_std": 0.28868658766150473, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.16096348613500594, "rewards/ReportKG_Jaccard/std": 0.05812877416610718, "step": 3325, "train_speed(iter/s)": 0.036793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 72.175, "completions/min_length": 57.4, "epoch": 0.6727272727272727, "grad_norm": 1.3544352054595947, "kl": 0.03946605548262596, "learning_rate": 1.4365348863171404e-06, "loss": 0.027430924773216247, "memory(GiB)": 69.34, "reward": 1.0340530157089234, "reward_std": 0.3990562424063683, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.2590530127286911, "rewards/ReportKG_Jaccard/std": 0.07547883465886115, "step": 3330, "train_speed(iter/s)": 0.036806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/mean_length": 68.1, "completions/min_length": 52.6, "epoch": 0.6737373737373737, "grad_norm": 1.04230535030365, "kl": 0.019351877830922605, "learning_rate": 1.428927379366809e-06, "loss": -0.007206656038761139, "memory(GiB)": 69.34, "reward": 1.0749453485012055, "reward_std": 0.26352900713682176, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.3249453455209732, "rewards/ReportKG_Jaccard/std": 0.09811627008020878, "step": 3335, "train_speed(iter/s)": 0.03681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 73.075, "completions/min_length": 53.6, "epoch": 0.6747474747474748, "grad_norm": 1.0948346853256226, "kl": 0.05667140930891037, "learning_rate": 1.4213288578730947e-06, "loss": 0.07407131195068359, "memory(GiB)": 69.34, "reward": 1.22647545337677, "reward_std": 0.28261940330266955, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.30147544741630555, "rewards/ReportKG_Jaccard/std": 0.1109362117946148, "step": 3340, "train_speed(iter/s)": 0.036825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 81.875, "completions/min_length": 65.2, "epoch": 0.6757575757575758, "grad_norm": 1.4733636379241943, "kl": 0.046209819987416266, "learning_rate": 1.4137394413937957e-06, "loss": 0.09030764698982238, "memory(GiB)": 69.34, "reward": 0.9083969265222549, "reward_std": 0.295070943236351, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.20839693918824195, "rewards/ReportKG_Jaccard/std": 0.058908002078533174, "step": 3345, "train_speed(iter/s)": 0.036844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.0, "completions/mean_length": 68.975, "completions/min_length": 51.2, "epoch": 0.6767676767676768, "grad_norm": 1.488986611366272, "kl": 0.040807997435331346, "learning_rate": 1.406159249343451e-06, "loss": 0.006400987505912781, "memory(GiB)": 69.34, "reward": 1.0518731713294982, "reward_std": 0.14585258513689042, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.30187316834926603, "rewards/ReportKG_Jaccard/std": 0.06762540191411973, "step": 3350, "train_speed(iter/s)": 0.036864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 74.7, "completions/min_length": 58.2, "epoch": 0.6777777777777778, "grad_norm": 1.0877149105072021, "kl": 0.0540477953851223, "learning_rate": 1.3985884009914539e-06, "loss": 0.004055076465010643, "memory(GiB)": 69.34, "reward": 0.9465835094451904, "reward_std": 0.45220582485198973, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.43348987102508546, "rewards/ReportKG_Jaccard/mean": 0.19658351093530654, "rewards/ReportKG_Jaccard/std": 0.051087409257888794, "step": 3355, "train_speed(iter/s)": 0.036876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/mean_length": 75.625, "completions/min_length": 48.2, "epoch": 0.6787878787878788, "grad_norm": 0.8847001194953918, "kl": 0.059712163731455804, "learning_rate": 1.3910270154601862e-06, "loss": 0.06818729043006896, "memory(GiB)": 69.34, "reward": 0.7568828999996186, "reward_std": 0.4711708605289459, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.10688288770616054, "rewards/ReportKG_Jaccard/std": 0.06733739916235208, "step": 3360, "train_speed(iter/s)": 0.036891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 74.8, "completions/mean_length": 58.65, "completions/min_length": 45.8, "epoch": 0.6797979797979798, "grad_norm": 1.0405598878860474, "kl": 0.030540285632014275, "learning_rate": 1.3834752117231355e-06, "loss": 0.07788789868354798, "memory(GiB)": 69.34, "reward": 0.7660542726516724, "reward_std": 0.3470334351062775, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.24105428755283356, "rewards/ReportKG_Jaccard/std": 0.10325385704636574, "step": 3365, "train_speed(iter/s)": 0.036906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.8, "completions/mean_length": 62.65, "completions/min_length": 52.8, "epoch": 0.6808080808080809, "grad_norm": 0.9801175594329834, "kl": 0.02537165880203247, "learning_rate": 1.375933108603026e-06, "loss": 0.03760623335838318, "memory(GiB)": 69.34, "reward": 0.7201439738273621, "reward_std": 0.31564081609249117, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.22014397606253625, "rewards/ReportKG_Jaccard/std": 0.07536918744444847, "step": 3370, "train_speed(iter/s)": 0.036926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/mean_length": 70.525, "completions/min_length": 54.0, "epoch": 0.6818181818181818, "grad_norm": 1.9834578037261963, "kl": 0.019019515812397005, "learning_rate": 1.3684008247699503e-06, "loss": 0.037494197487831116, "memory(GiB)": 69.34, "reward": 0.8015975534915925, "reward_std": 0.35325779020786285, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.3015975832939148, "rewards/ReportKG_Jaccard/std": 0.11104477792978287, "step": 3375, "train_speed(iter/s)": 0.036937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 63.25, "completions/min_length": 46.6, "epoch": 0.6828282828282828, "grad_norm": 1.1653497219085693, "kl": 0.03340048789978027, "learning_rate": 1.3608784787395002e-06, "loss": 0.015675652027130126, "memory(GiB)": 69.34, "reward": 0.7114969909191131, "reward_std": 0.4076120853424072, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.18649702072143554, "rewards/ReportKG_Jaccard/std": 0.08964361026883125, "step": 3380, "train_speed(iter/s)": 0.036954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.2, "completions/mean_length": 70.725, "completions/min_length": 53.8, "epoch": 0.6838383838383838, "grad_norm": 1.0898648500442505, "kl": 0.026143189705908298, "learning_rate": 1.3533661888709023e-06, "loss": 0.041209495067596434, "memory(GiB)": 69.34, "reward": 1.0352782368659974, "reward_std": 0.23529116362333297, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.3102782338857651, "rewards/ReportKG_Jaccard/std": 0.09185753092169761, "step": 3385, "train_speed(iter/s)": 0.036968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.4, "completions/mean_length": 68.15, "completions/min_length": 49.8, "epoch": 0.6848484848484848, "grad_norm": 1.365127682685852, "kl": 0.02941881213337183, "learning_rate": 1.3458640733651567e-06, "loss": 0.03956746459007263, "memory(GiB)": 69.34, "reward": 1.0229925870895387, "reward_std": 0.30106624215841293, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.3229926139116287, "rewards/ReportKG_Jaccard/std": 0.07111301869153977, "step": 3390, "train_speed(iter/s)": 0.036978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 71.625, "completions/min_length": 51.0, "epoch": 0.6858585858585858, "grad_norm": 2.2305662631988525, "kl": 0.029593056440353392, "learning_rate": 1.338372250263176e-06, "loss": -0.05397610068321228, "memory(GiB)": 69.34, "reward": 0.8879365980625152, "reward_std": 0.4559537172317505, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.41916738748550414, "rewards/ReportKG_Jaccard/mean": 0.18793660700321196, "rewards/ReportKG_Jaccard/std": 0.07109567299485206, "step": 3395, "train_speed(iter/s)": 0.036966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 76.25, "completions/min_length": 53.6, "epoch": 0.6868686868686869, "grad_norm": 1.6170895099639893, "kl": 0.03647776860743761, "learning_rate": 1.3308908374439297e-06, "loss": 0.03995192050933838, "memory(GiB)": 69.34, "reward": 0.6956628143787384, "reward_std": 0.19097977727651597, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.1456628181040287, "rewards/ReportKG_Jaccard/std": 0.06638235673308372, "step": 3400, "train_speed(iter/s)": 0.036977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 74.075, "completions/min_length": 55.4, "epoch": 0.6878787878787879, "grad_norm": 1.0632355213165283, "kl": 0.034165669791400434, "learning_rate": 1.3234199526225856e-06, "loss": 0.008468679338693618, "memory(GiB)": 69.34, "reward": 0.7084107786417008, "reward_std": 0.3912784934043884, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.23341078758239747, "rewards/ReportKG_Jaccard/std": 0.05430796518921852, "step": 3405, "train_speed(iter/s)": 0.036975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 70.575, "completions/min_length": 55.6, "epoch": 0.6888888888888889, "grad_norm": 1.5238373279571533, "kl": 0.03313009981065988, "learning_rate": 1.3159597133486625e-06, "loss": 0.04387349486351013, "memory(GiB)": 69.34, "reward": 0.7729763269424439, "reward_std": 0.3981353521347046, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.17297632917761802, "rewards/ReportKG_Jaccard/std": 0.04222491458058357, "step": 3410, "train_speed(iter/s)": 0.036952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.4, "completions/mean_length": 70.15, "completions/min_length": 49.0, "epoch": 0.6898989898989899, "grad_norm": 1.3719487190246582, "kl": 0.050012710690498355, "learning_rate": 1.3085102370041789e-06, "loss": 0.04165834784507751, "memory(GiB)": 69.34, "reward": 0.8274055987596511, "reward_std": 0.38886847496032717, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.2274056002497673, "rewards/ReportKG_Jaccard/std": 0.06497465893626213, "step": 3415, "train_speed(iter/s)": 0.03696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.0, "completions/mean_length": 83.025, "completions/min_length": 61.2, "epoch": 0.6909090909090909, "grad_norm": 0.9873687028884888, "kl": 0.02442213874310255, "learning_rate": 1.3010716408018035e-06, "loss": -0.011637432873249054, "memory(GiB)": 69.34, "reward": 0.6010152369737625, "reward_std": 0.38346799835562706, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.1510152444243431, "rewards/ReportKG_Jaccard/std": 0.06459518447518349, "step": 3420, "train_speed(iter/s)": 0.036964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 81.4, "completions/min_length": 62.8, "epoch": 0.6919191919191919, "grad_norm": 1.090253472328186, "kl": 0.029772515781223773, "learning_rate": 1.2936440417830144e-06, "loss": 0.0030345942825078962, "memory(GiB)": 69.34, "reward": 0.6456736564636231, "reward_std": 0.32342192977666856, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.22067365646362305, "rewards/ReportKG_Jaccard/std": 0.06493729092180729, "step": 3425, "train_speed(iter/s)": 0.036976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 69.675, "completions/min_length": 55.8, "epoch": 0.692929292929293, "grad_norm": 1.4521327018737793, "kl": 0.053627743385732174, "learning_rate": 1.2862275568162563e-06, "loss": 0.06786175966262817, "memory(GiB)": 69.34, "reward": 0.792878258228302, "reward_std": 0.3168135568499565, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.24287824630737304, "rewards/ReportKG_Jaccard/std": 0.0725253127515316, "step": 3430, "train_speed(iter/s)": 0.036997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 63.7, "completions/min_length": 48.0, "epoch": 0.693939393939394, "grad_norm": 1.2964872121810913, "kl": 0.03956865295767784, "learning_rate": 1.2788223025951007e-06, "loss": 0.0024944301694631577, "memory(GiB)": 69.34, "reward": 0.8725772559642792, "reward_std": 0.3071696855127811, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.27257726192474363, "rewards/ReportKG_Jaccard/std": 0.05368962325155735, "step": 3435, "train_speed(iter/s)": 0.037007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 67.925, "completions/min_length": 57.6, "epoch": 0.694949494949495, "grad_norm": 1.6428104639053345, "kl": 0.05203548036515713, "learning_rate": 1.2714283956364112e-06, "loss": 0.02388574481010437, "memory(GiB)": 69.34, "reward": 0.9094224274158478, "reward_std": 0.31847703009843825, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.23442247807979583, "rewards/ReportKG_Jaccard/std": 0.08146887198090554, "step": 3440, "train_speed(iter/s)": 0.03703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.6, "completions/mean_length": 83.275, "completions/min_length": 65.4, "epoch": 0.695959595959596, "grad_norm": 1.569846749305725, "kl": 0.027678871527314186, "learning_rate": 1.26404595227851e-06, "loss": -0.02698746919631958, "memory(GiB)": 69.34, "reward": 1.021955132484436, "reward_std": 0.40895331650972366, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.27195516377687456, "rewards/ReportKG_Jaccard/std": 0.061066734790802005, "step": 3445, "train_speed(iter/s)": 0.037035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.4, "completions/mean_length": 83.675, "completions/min_length": 62.6, "epoch": 0.696969696969697, "grad_norm": 1.5736697912216187, "kl": 0.02409764304757118, "learning_rate": 1.2566750886793452e-06, "loss": -0.03792165517807007, "memory(GiB)": 69.34, "reward": 0.7064051866531372, "reward_std": 0.3269012048840523, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.18140517249703408, "rewards/ReportKG_Jaccard/std": 0.04010780714452267, "step": 3450, "train_speed(iter/s)": 0.037032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 76.4, "completions/mean_length": 60.275, "completions/min_length": 48.2, "epoch": 0.697979797979798, "grad_norm": 0.8667895197868347, "kl": 0.0344265878200531, "learning_rate": 1.2493159208146665e-06, "loss": 0.01754707843065262, "memory(GiB)": 69.34, "reward": 0.921251118183136, "reward_std": 0.3834318071603775, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.3212511032819748, "rewards/ReportKG_Jaccard/std": 0.09960766732692719, "step": 3455, "train_speed(iter/s)": 0.037043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.6, "completions/mean_length": 73.0, "completions/min_length": 59.8, "epoch": 0.6989898989898989, "grad_norm": 1.132820963859558, "kl": 0.04046899527311325, "learning_rate": 1.2419685644761994e-06, "loss": 0.06334569454193115, "memory(GiB)": 69.34, "reward": 0.8780626952648163, "reward_std": 0.27152094841003416, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.25306269377470014, "rewards/ReportKG_Jaccard/std": 0.0707031860947609, "step": 3460, "train_speed(iter/s)": 0.037055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.8, "completions/mean_length": 77.35, "completions/min_length": 57.2, "epoch": 0.7, "grad_norm": 1.834063172340393, "kl": 0.029978914186358452, "learning_rate": 1.2346331352698205e-06, "loss": 0.029344356060028075, "memory(GiB)": 69.34, "reward": 1.0267861366271973, "reward_std": 0.43028348684310913, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.22678615003824235, "rewards/ReportKG_Jaccard/std": 0.056295585632324216, "step": 3465, "train_speed(iter/s)": 0.037056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 74.85, "completions/min_length": 57.6, "epoch": 0.701010101010101, "grad_norm": 1.958200216293335, "kl": 0.02668438255786896, "learning_rate": 1.2273097486137424e-06, "loss": 0.08852960467338562, "memory(GiB)": 69.34, "reward": 0.6415267705917358, "reward_std": 0.5476196348667145, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.49153932929039, "rewards/ReportKG_Jaccard/mean": 0.26652675271034243, "rewards/ReportKG_Jaccard/std": 0.08095155991613864, "step": 3470, "train_speed(iter/s)": 0.037058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 70.475, "completions/min_length": 54.4, "epoch": 0.702020202020202, "grad_norm": 1.3420823812484741, "kl": 0.021964090690016747, "learning_rate": 1.2199985197366967e-06, "loss": 0.06707614660263062, "memory(GiB)": 69.34, "reward": 0.8710847914218902, "reward_std": 0.4472385346889496, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.1710847571492195, "rewards/ReportKG_Jaccard/std": 0.06595703586935997, "step": 3475, "train_speed(iter/s)": 0.037059 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 70.225, "completions/min_length": 54.8, "epoch": 0.703030303030303, "grad_norm": 1.391750454902649, "kl": 0.026002169400453568, "learning_rate": 1.2126995636761172e-06, "loss": 0.05072764754295349, "memory(GiB)": 69.34, "reward": 0.8496266841888428, "reward_std": 0.4297777071595192, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3737070143222809, "rewards/ReportKG_Jaccard/mean": 0.24962669014930725, "rewards/ReportKG_Jaccard/std": 0.07413640916347504, "step": 3480, "train_speed(iter/s)": 0.037077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.4, "completions/mean_length": 68.6, "completions/min_length": 54.0, "epoch": 0.704040404040404, "grad_norm": 1.5621365308761597, "kl": 0.03693276159465313, "learning_rate": 1.2054129952763362e-06, "loss": -0.0018640995025634766, "memory(GiB)": 69.34, "reward": 0.6983231186866761, "reward_std": 0.35839069485664365, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.1733231097459793, "rewards/ReportKG_Jaccard/std": 0.04812990538775921, "step": 3485, "train_speed(iter/s)": 0.03708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 71.675, "completions/min_length": 51.8, "epoch": 0.705050505050505, "grad_norm": 0.9853915572166443, "kl": 0.05475051663815975, "learning_rate": 1.198138929186773e-06, "loss": 0.03159377872943878, "memory(GiB)": 69.34, "reward": 0.9247266292572022, "reward_std": 0.4556397795677185, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.2747266530990601, "rewards/ReportKG_Jaccard/std": 0.08088306710124016, "step": 3490, "train_speed(iter/s)": 0.037083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 64.025, "completions/min_length": 49.4, "epoch": 0.706060606060606, "grad_norm": 1.343016505241394, "kl": 0.058009221032261846, "learning_rate": 1.1908774798601298e-06, "loss": 0.06614699363708496, "memory(GiB)": 69.34, "reward": 0.9454564213752746, "reward_std": 0.21578352004289628, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.12045642817392946, "rewards/ReportKG_Jaccard/std": 0.05691123027354479, "step": 3495, "train_speed(iter/s)": 0.037083 }, { "epoch": 0.7070707070707071, "grad_norm": 1.4505771398544312, "learning_rate": 1.1836287615505943e-06, "loss": 0.057090312242507935, "memory(GiB)": 69.34, "step": 3500, "train_speed(iter/s)": 0.037079 }, { "epoch": 0.7070707070707071, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 98.08, "eval_completions/mean_length": 75.15, "eval_completions/min_length": 55.2, "eval_kl": 0.03299759685993195, "eval_loss": 0.0039021249394863844, "eval_reward": 0.7665604239702225, "eval_reward_std": 0.27779658749699593, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.55, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.23247122406959533, "eval_rewards/ReportKG_Jaccard/mean": 0.21656041972339155, "eval_rewards/ReportKG_Jaccard/std": 0.057591882888227704, "eval_runtime": 889.7079, "eval_samples_per_second": 0.056, "eval_steps_per_second": 0.008, "step": 3500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.5, "completions/mean_length": 76.4375, "completions/min_length": 56.0, "epoch": 0.7080808080808081, "grad_norm": 1.2432605028152466, "kl": 0.03313706563785672, "learning_rate": 1.1763928883120388e-06, "loss": 0.01941142976284027, "memory(GiB)": 69.34, "reward": 0.793296217918396, "reward_std": 0.405549842864275, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36740462481975555, "rewards/ReportKG_Jaccard/mean": 0.21829618513584137, "rewards/ReportKG_Jaccard/std": 0.062169233709573744, "step": 3505, "train_speed(iter/s)": 0.036714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 89.2, "completions/mean_length": 69.975, "completions/min_length": 58.0, "epoch": 0.7090909090909091, "grad_norm": 1.4055143594741821, "kl": 0.021954662166535854, "learning_rate": 1.1691699739962272e-06, "loss": 0.038152444362640384, "memory(GiB)": 69.34, "reward": 0.5355944722890854, "reward_std": 0.4029425919055939, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.3863525390625, "rewards/ReportKG_Jaccard/mean": 0.21059448570013045, "rewards/ReportKG_Jaccard/std": 0.05971558913588524, "step": 3510, "train_speed(iter/s)": 0.036725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.0, "completions/mean_length": 66.825, "completions/min_length": 54.0, "epoch": 0.7101010101010101, "grad_norm": 1.6451117992401123, "kl": 0.027712633460760118, "learning_rate": 1.161960132251023e-06, "loss": 0.020328444242477418, "memory(GiB)": 69.34, "reward": 0.8532968878746032, "reward_std": 0.30846676528453826, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.25329691767692564, "rewards/ReportKG_Jaccard/std": 0.07094041034579276, "step": 3515, "train_speed(iter/s)": 0.036732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 82.3, "completions/min_length": 63.4, "epoch": 0.7111111111111111, "grad_norm": 1.875780463218689, "kl": 0.031783667579293254, "learning_rate": 1.1547634765186014e-06, "loss": 0.002102683484554291, "memory(GiB)": 69.34, "reward": 1.001711082458496, "reward_std": 0.37559500634670256, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.25171108543872833, "rewards/ReportKG_Jaccard/std": 0.06752791628241539, "step": 3520, "train_speed(iter/s)": 0.036743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 75.65, "completions/min_length": 60.2, "epoch": 0.7121212121212122, "grad_norm": 1.202955722808838, "kl": 0.026725856587290764, "learning_rate": 1.147580120033664e-06, "loss": 0.015469615161418915, "memory(GiB)": 69.34, "reward": 0.9401142418384552, "reward_std": 0.3865345671772957, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2151142328977585, "rewards/ReportKG_Jaccard/std": 0.07647521942853927, "step": 3525, "train_speed(iter/s)": 0.036747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 75.45, "completions/min_length": 51.8, "epoch": 0.7131313131313132, "grad_norm": 1.5866053104400635, "kl": 0.02306144069880247, "learning_rate": 1.1404101758216566e-06, "loss": -0.05756632089614868, "memory(GiB)": 69.34, "reward": 0.7847530364990234, "reward_std": 0.469437712430954, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.45196654200553893, "rewards/ReportKG_Jaccard/mean": 0.23475304394960403, "rewards/ReportKG_Jaccard/std": 0.0744501544162631, "step": 3530, "train_speed(iter/s)": 0.036751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 70.575, "completions/min_length": 49.8, "epoch": 0.7141414141414142, "grad_norm": 1.2715249061584473, "kl": 0.038750671222805975, "learning_rate": 1.1332537566969942e-06, "loss": 0.0750929594039917, "memory(GiB)": 69.34, "reward": 1.0695464372634889, "reward_std": 0.3021655842661858, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.3445464432239532, "rewards/ReportKG_Jaccard/std": 0.086357381939888, "step": 3535, "train_speed(iter/s)": 0.036736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.6, "completions/mean_length": 74.5, "completions/min_length": 61.8, "epoch": 0.7151515151515152, "grad_norm": 0.9199817180633545, "kl": 0.04600386656820774, "learning_rate": 1.126110975261281e-06, "loss": 0.012851661443710327, "memory(GiB)": 69.34, "reward": 1.067218041419983, "reward_std": 0.30801935493946075, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.24221805036067962, "rewards/ReportKG_Jaccard/std": 0.051562849804759024, "step": 3540, "train_speed(iter/s)": 0.036732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.6, "completions/mean_length": 88.425, "completions/min_length": 70.2, "epoch": 0.7161616161616161, "grad_norm": 1.4035183191299438, "kl": 0.02605426274240017, "learning_rate": 1.118981943901541e-06, "loss": 0.033580005168914795, "memory(GiB)": 69.34, "reward": 1.0678370594978333, "reward_std": 0.26955184936523435, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.21380898952484131, "rewards/ReportKG_Jaccard/mean": 0.26783704236149786, "rewards/ReportKG_Jaccard/std": 0.07170215174555779, "step": 3545, "train_speed(iter/s)": 0.036734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 75.775, "completions/min_length": 56.6, "epoch": 0.7171717171717171, "grad_norm": 1.0815587043762207, "kl": 0.030934570729732512, "learning_rate": 1.1118667747884516e-06, "loss": 0.07841631174087524, "memory(GiB)": 69.34, "reward": 0.8859682321548462, "reward_std": 0.42320995330810546, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.23596819788217543, "rewards/ReportKG_Jaccard/std": 0.0837235376238823, "step": 3550, "train_speed(iter/s)": 0.03674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/mean_length": 67.625, "completions/min_length": 52.0, "epoch": 0.7181818181818181, "grad_norm": 1.2447683811187744, "kl": 0.033139287307858464, "learning_rate": 1.104765579874575e-06, "loss": 0.015027821063995361, "memory(GiB)": 69.34, "reward": 1.004491138458252, "reward_std": 0.42189290672540664, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.30449112951755525, "rewards/ReportKG_Jaccard/std": 0.12355592399835587, "step": 3555, "train_speed(iter/s)": 0.036732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.6, "completions/mean_length": 75.55, "completions/min_length": 57.0, "epoch": 0.7191919191919192, "grad_norm": 1.2534379959106445, "kl": 0.03233785890042782, "learning_rate": 1.097678470892599e-06, "loss": -0.021684107184410096, "memory(GiB)": 69.34, "reward": 0.8971928358078003, "reward_std": 0.4013035759329796, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.19719281792640686, "rewards/ReportKG_Jaccard/std": 0.07650401070713997, "step": 3560, "train_speed(iter/s)": 0.036743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.2, "completions/mean_length": 75.975, "completions/min_length": 60.4, "epoch": 0.7202020202020202, "grad_norm": 1.417561650276184, "kl": 0.03172037564218044, "learning_rate": 1.0906055593535775e-06, "loss": 5.628913640975952e-05, "memory(GiB)": 69.34, "reward": 0.6618461072444916, "reward_std": 0.3777580827474594, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.18684611320495606, "rewards/ReportKG_Jaccard/std": 0.06320227682590485, "step": 3565, "train_speed(iter/s)": 0.036753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.0, "completions/mean_length": 69.425, "completions/min_length": 53.4, "epoch": 0.7212121212121212, "grad_norm": 1.0834516286849976, "kl": 0.04733262583613396, "learning_rate": 1.0835469565451792e-06, "loss": 0.0706303596496582, "memory(GiB)": 69.34, "reward": 0.9026591062545777, "reward_std": 0.38320941627025606, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.3026591032743454, "rewards/ReportKG_Jaccard/std": 0.09535691738128663, "step": 3570, "train_speed(iter/s)": 0.036753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 74.775, "completions/min_length": 56.0, "epoch": 0.7222222222222222, "grad_norm": 1.0845285654067993, "kl": 0.03895643539726734, "learning_rate": 1.0765027735299326e-06, "loss": 0.023078452050685882, "memory(GiB)": 69.34, "reward": 1.0202895760536195, "reward_std": 0.3582515761256218, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.19528957605361938, "rewards/ReportKG_Jaccard/std": 0.052873679995536806, "step": 3575, "train_speed(iter/s)": 0.036763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.6, "completions/mean_length": 77.5, "completions/min_length": 57.8, "epoch": 0.7232323232323232, "grad_norm": 1.5971091985702515, "kl": 0.025134269148111343, "learning_rate": 1.0694731211434786e-06, "loss": 0.004552432149648666, "memory(GiB)": 69.34, "reward": 0.899526047706604, "reward_std": 0.3730612605810165, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.2245260387659073, "rewards/ReportKG_Jaccard/std": 0.0802522599697113, "step": 3580, "train_speed(iter/s)": 0.036761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 78.85, "completions/min_length": 60.8, "epoch": 0.7242424242424242, "grad_norm": 1.4041402339935303, "kl": 0.021059132553637026, "learning_rate": 1.0624581099928323e-06, "loss": 0.034336197376251223, "memory(GiB)": 69.34, "reward": 0.6613422334194183, "reward_std": 0.23923420757055283, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.23634225726127625, "rewards/ReportKG_Jaccard/std": 0.08635939806699752, "step": 3585, "train_speed(iter/s)": 0.036769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.8, "completions/mean_length": 85.0, "completions/min_length": 62.2, "epoch": 0.7252525252525253, "grad_norm": 1.564273476600647, "kl": 0.027898553013801574, "learning_rate": 1.0554578504546349e-06, "loss": -0.030665796995162965, "memory(GiB)": 69.34, "reward": 0.8555998384952546, "reward_std": 0.3901869595050812, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.18059984743595123, "rewards/ReportKG_Jaccard/std": 0.0691125102341175, "step": 3590, "train_speed(iter/s)": 0.036768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.6, "completions/mean_length": 67.15, "completions/min_length": 53.6, "epoch": 0.7262626262626263, "grad_norm": 1.317039132118225, "kl": 0.029932988807559013, "learning_rate": 1.0484724526734202e-06, "loss": 0.048666125535964964, "memory(GiB)": 69.34, "reward": 0.9710591435432434, "reward_std": 0.26909763514995577, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.1776151716709137, "rewards/ReportKG_Jaccard/mean": 0.2960591435432434, "rewards/ReportKG_Jaccard/std": 0.10271954461932183, "step": 3595, "train_speed(iter/s)": 0.036775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.2, "completions/mean_length": 66.475, "completions/min_length": 51.8, "epoch": 0.7272727272727273, "grad_norm": 1.020113468170166, "kl": 0.03711164817214012, "learning_rate": 1.0415020265598871e-06, "loss": 0.045555087924003604, "memory(GiB)": 69.34, "reward": 1.1304157018661498, "reward_std": 0.2293233796954155, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.1776151716709137, "rewards/ReportKG_Jaccard/mean": 0.25541568696498873, "rewards/ReportKG_Jaccard/std": 0.06554323807358742, "step": 3600, "train_speed(iter/s)": 0.036785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 80.375, "completions/min_length": 66.0, "epoch": 0.7282828282828283, "grad_norm": 1.2401599884033203, "kl": 0.034268707409501074, "learning_rate": 1.034546681789157e-06, "loss": -0.008707968890666962, "memory(GiB)": 69.34, "reward": 0.6419754743576049, "reward_std": 0.46962887048721313, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.46289436221122743, "rewards/ReportKG_Jaccard/mean": 0.2169754695147276, "rewards/ReportKG_Jaccard/std": 0.04926264397799969, "step": 3605, "train_speed(iter/s)": 0.036776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 84.675, "completions/min_length": 65.2, "epoch": 0.7292929292929293, "grad_norm": 0.882712185382843, "kl": 0.0336054839193821, "learning_rate": 1.027606527799063e-06, "loss": 0.016467830538749693, "memory(GiB)": 69.34, "reward": 1.16866694688797, "reward_std": 0.22287142053246498, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.2436669409275055, "rewards/ReportKG_Jaccard/std": 0.07256494089961052, "step": 3610, "train_speed(iter/s)": 0.036771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 69.65, "completions/min_length": 51.2, "epoch": 0.7303030303030303, "grad_norm": 1.1017190217971802, "kl": 0.026272835955023766, "learning_rate": 1.020681673788418e-06, "loss": 0.01870792210102081, "memory(GiB)": 69.34, "reward": 0.6953805327415467, "reward_std": 0.2911605477333069, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.19538049697875975, "rewards/ReportKG_Jaccard/std": 0.07073552533984184, "step": 3615, "train_speed(iter/s)": 0.036784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 68.45, "completions/min_length": 49.0, "epoch": 0.7313131313131314, "grad_norm": 1.250342607498169, "kl": 0.04495018497109413, "learning_rate": 1.0137722287152994e-06, "loss": 0.05291804075241089, "memory(GiB)": 69.34, "reward": 1.1027877926826477, "reward_std": 0.22968070656061174, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.25278778970241544, "rewards/ReportKG_Jaccard/std": 0.07091316655278206, "step": 3620, "train_speed(iter/s)": 0.036792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.4, "completions/mean_length": 75.0, "completions/min_length": 46.8, "epoch": 0.7323232323232324, "grad_norm": 1.4240037202835083, "kl": 0.025190436840057374, "learning_rate": 1.006878301295338e-06, "loss": -0.04872236847877502, "memory(GiB)": 69.34, "reward": 0.5409855902194977, "reward_std": 0.52230783700943, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.16598560363054277, "rewards/ReportKG_Jaccard/std": 0.06832100450992584, "step": 3625, "train_speed(iter/s)": 0.036804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.6, "completions/mean_length": 74.475, "completions/min_length": 52.4, "epoch": 0.7333333333333333, "grad_norm": 1.7408397197723389, "kl": 0.02909595277160406, "learning_rate": 1.0000000000000004e-06, "loss": 0.05570317506790161, "memory(GiB)": 69.34, "reward": 0.4892177850008011, "reward_std": 0.3111743815243244, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.1642177700996399, "rewards/ReportKG_Jaccard/std": 0.07100460901856423, "step": 3630, "train_speed(iter/s)": 0.036818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 78.4, "completions/min_length": 57.6, "epoch": 0.7343434343434343, "grad_norm": 1.3618595600128174, "kl": 0.0352958295494318, "learning_rate": 9.93137433054888e-07, "loss": 0.05947468280792236, "memory(GiB)": 69.34, "reward": 0.7721140325069428, "reward_std": 0.41868910044431684, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.1971140369772911, "rewards/ReportKG_Jaccard/std": 0.08494699001312256, "step": 3635, "train_speed(iter/s)": 0.036821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.2, "completions/mean_length": 71.175, "completions/min_length": 51.2, "epoch": 0.7353535353535353, "grad_norm": 1.2971068620681763, "kl": 0.026985886693000793, "learning_rate": 9.862907084380344e-07, "loss": 0.0017992204055190086, "memory(GiB)": 69.34, "reward": 0.8437908232212067, "reward_std": 0.29777069091796876, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.26879081279039385, "rewards/ReportKG_Jaccard/std": 0.06799322068691253, "step": 3640, "train_speed(iter/s)": 0.036825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 71.525, "completions/min_length": 44.8, "epoch": 0.7363636363636363, "grad_norm": 1.097058892250061, "kl": 0.027067912742495537, "learning_rate": 9.79459933878201e-07, "loss": 0.06317728161811828, "memory(GiB)": 69.34, "reward": 0.6740954756736756, "reward_std": 0.3683285877108574, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.17409546077251434, "rewards/ReportKG_Jaccard/std": 0.08895271867513657, "step": 3645, "train_speed(iter/s)": 0.036819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 80.4, "completions/mean_length": 62.175, "completions/min_length": 49.0, "epoch": 0.7373737373737373, "grad_norm": 1.3343019485473633, "kl": 0.03450812175869942, "learning_rate": 9.726452168531878e-07, "loss": 0.08436774015426636, "memory(GiB)": 69.34, "reward": 0.7922736763954162, "reward_std": 0.4405760645866394, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.19227368235588074, "rewards/ReportKG_Jaccard/std": 0.05714343748986721, "step": 3650, "train_speed(iter/s)": 0.036825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.8, "completions/mean_length": 75.175, "completions/min_length": 56.6, "epoch": 0.7383838383838384, "grad_norm": 1.2246730327606201, "kl": 0.025043205916881563, "learning_rate": 9.65846664588138e-07, "loss": -0.013337436318397521, "memory(GiB)": 69.34, "reward": 0.919906222820282, "reward_std": 0.4824563145637512, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4519508481025696, "rewards/ReportKG_Jaccard/mean": 0.2949062168598175, "rewards/ReportKG_Jaccard/std": 0.07406804040074348, "step": 3655, "train_speed(iter/s)": 0.036825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.4, "completions/mean_length": 73.525, "completions/min_length": 53.0, "epoch": 0.7393939393939394, "grad_norm": 1.1536204814910889, "kl": 0.03348603341728449, "learning_rate": 9.590643840538557e-07, "loss": 0.09382997155189514, "memory(GiB)": 69.34, "reward": 1.02842378616333, "reward_std": 0.40070369094610214, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.3034237861633301, "rewards/ReportKG_Jaccard/std": 0.06229583472013474, "step": 3660, "train_speed(iter/s)": 0.036826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 69.275, "completions/min_length": 51.8, "epoch": 0.7404040404040404, "grad_norm": 0.8857042789459229, "kl": 0.02579685375094414, "learning_rate": 9.522984819651172e-07, "loss": 0.06541651487350464, "memory(GiB)": 69.34, "reward": 0.6853220939636231, "reward_std": 0.33028142601251603, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.23532211780548096, "rewards/ReportKG_Jaccard/std": 0.07727588042616844, "step": 3665, "train_speed(iter/s)": 0.036824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 69.975, "completions/min_length": 52.2, "epoch": 0.7414141414141414, "grad_norm": 1.1367743015289307, "kl": 0.029237336292862893, "learning_rate": 9.45549064778995e-07, "loss": 0.030917856097221374, "memory(GiB)": 69.34, "reward": 0.701179850101471, "reward_std": 0.4118255376815796, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.25117988884449005, "rewards/ReportKG_Jaccard/std": 0.055550896748900416, "step": 3670, "train_speed(iter/s)": 0.036833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.8, "completions/mean_length": 66.975, "completions/min_length": 53.4, "epoch": 0.7424242424242424, "grad_norm": 1.4032155275344849, "kl": 0.027072552032768725, "learning_rate": 9.388162386931842e-07, "loss": 0.05402673482894897, "memory(GiB)": 69.34, "reward": 1.1705083966255188, "reward_std": 0.38007327616214753, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.3955083966255188, "rewards/ReportKG_Jaccard/std": 0.12857754677534103, "step": 3675, "train_speed(iter/s)": 0.03685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.6, "completions/mean_length": 83.225, "completions/min_length": 61.8, "epoch": 0.7434343434343434, "grad_norm": 1.0196242332458496, "kl": 0.038567826896905896, "learning_rate": 9.321001096443279e-07, "loss": 0.0017467081546783448, "memory(GiB)": 69.34, "reward": 0.7622594118118287, "reward_std": 0.4513046622276306, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.16225941330194474, "rewards/ReportKG_Jaccard/std": 0.04374707266688347, "step": 3680, "train_speed(iter/s)": 0.036853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 71.975, "completions/min_length": 49.6, "epoch": 0.7444444444444445, "grad_norm": 1.0372751951217651, "kl": 0.04718552194535732, "learning_rate": 9.25400783306352e-07, "loss": 0.02261899709701538, "memory(GiB)": 69.34, "reward": 0.8520722270011902, "reward_std": 0.3792333349585533, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.1770722448825836, "rewards/ReportKG_Jaccard/std": 0.057791562750935555, "step": 3685, "train_speed(iter/s)": 0.036855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 76.375, "completions/min_length": 55.6, "epoch": 0.7454545454545455, "grad_norm": 0.9347218871116638, "kl": 0.024520765617489816, "learning_rate": 9.187183650888055e-07, "loss": 0.03106388747692108, "memory(GiB)": 69.34, "reward": 0.8513548135757446, "reward_std": 0.4081577599048615, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.25135481655597686, "rewards/ReportKG_Jaccard/std": 0.08583913035690785, "step": 3690, "train_speed(iter/s)": 0.036855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 117.0, "completions/mean_length": 85.175, "completions/min_length": 58.2, "epoch": 0.7464646464646465, "grad_norm": 1.5373140573501587, "kl": 0.018693983554840088, "learning_rate": 9.12052960135194e-07, "loss": -0.025640535354614257, "memory(GiB)": 69.34, "reward": 0.8648226499557495, "reward_std": 0.44583268761634826, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.21482264399528503, "rewards/ReportKG_Jaccard/std": 0.056585357338190076, "step": 3695, "train_speed(iter/s)": 0.036856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.4, "completions/mean_length": 75.3, "completions/min_length": 53.2, "epoch": 0.7474747474747475, "grad_norm": 1.4108362197875977, "kl": 0.029060322046279907, "learning_rate": 9.054046733213357e-07, "loss": 0.04600246548652649, "memory(GiB)": 69.34, "reward": 0.8289653897285462, "reward_std": 0.4437500834465027, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.3955783486366272, "rewards/ReportKG_Jaccard/mean": 0.2039654016494751, "rewards/ReportKG_Jaccard/std": 0.09787668287754059, "step": 3700, "train_speed(iter/s)": 0.036864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 75.125, "completions/min_length": 56.2, "epoch": 0.7484848484848485, "grad_norm": 1.5092520713806152, "kl": 0.045819733664393424, "learning_rate": 8.987736092537028e-07, "loss": 0.07005739808082581, "memory(GiB)": 69.34, "reward": 0.884764963388443, "reward_std": 0.413567054271698, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.23476491272449493, "rewards/ReportKG_Jaccard/std": 0.06038282439112663, "step": 3705, "train_speed(iter/s)": 0.036866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.6, "completions/mean_length": 74.125, "completions/min_length": 60.4, "epoch": 0.7494949494949495, "grad_norm": 1.088098168373108, "kl": 0.02216991372406483, "learning_rate": 8.921598722677794e-07, "loss": -0.010064509510993958, "memory(GiB)": 69.34, "reward": 0.7941551029682159, "reward_std": 0.33050562888383866, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.28112500309944155, "rewards/ReportKG_Jaccard/mean": 0.19415510892868043, "rewards/ReportKG_Jaccard/std": 0.07156263068318366, "step": 3710, "train_speed(iter/s)": 0.036875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.0, "completions/mean_length": 70.75, "completions/min_length": 49.4, "epoch": 0.7505050505050505, "grad_norm": 1.2025562524795532, "kl": 0.02988172769546509, "learning_rate": 8.855635664264208e-07, "loss": 0.08828376531600952, "memory(GiB)": 69.34, "reward": 1.0088648557662965, "reward_std": 0.3328052133321762, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.15886483192443848, "rewards/ReportKG_Jaccard/std": 0.051213746517896654, "step": 3715, "train_speed(iter/s)": 0.036877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.2, "completions/mean_length": 72.375, "completions/min_length": 54.0, "epoch": 0.7515151515151515, "grad_norm": 0.9767293334007263, "kl": 0.025577415153384207, "learning_rate": 8.789847955182117e-07, "loss": 0.0024302484467625617, "memory(GiB)": 69.34, "reward": 0.7749355733394623, "reward_std": 0.4677649259567261, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.2249355733394623, "rewards/ReportKG_Jaccard/std": 0.07434638105332851, "step": 3720, "train_speed(iter/s)": 0.036878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.8, "completions/mean_length": 79.4, "completions/min_length": 58.0, "epoch": 0.7525252525252525, "grad_norm": 1.5457054376602173, "kl": 0.027506164461374282, "learning_rate": 8.724236630558392e-07, "loss": 0.010048232972621918, "memory(GiB)": 69.34, "reward": 0.6568981349468231, "reward_std": 0.35055644512176515, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.1568981595337391, "rewards/ReportKG_Jaccard/std": 0.051001697033643725, "step": 3725, "train_speed(iter/s)": 0.03688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.2, "completions/mean_length": 63.7, "completions/min_length": 48.4, "epoch": 0.7535353535353535, "grad_norm": 1.8414695262908936, "kl": 0.04615763090550899, "learning_rate": 8.658802722744587e-07, "loss": 0.056430160999298096, "memory(GiB)": 69.34, "reward": 0.7436659187078476, "reward_std": 0.361978317797184, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2436659336090088, "rewards/ReportKG_Jaccard/std": 0.06304063983261585, "step": 3730, "train_speed(iter/s)": 0.036881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 79.525, "completions/min_length": 59.2, "epoch": 0.7545454545454545, "grad_norm": 0.7298973202705383, "kl": 0.019743522815406322, "learning_rate": 8.593547261300715e-07, "loss": -0.06486127376556397, "memory(GiB)": 69.34, "reward": 0.8419650912284851, "reward_std": 0.4143169164657593, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.38124017119407655, "rewards/ReportKG_Jaccard/mean": 0.24196507334709166, "rewards/ReportKG_Jaccard/std": 0.0625507727265358, "step": 3735, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 126.2, "completions/mean_length": 83.225, "completions/min_length": 62.8, "epoch": 0.7555555555555555, "grad_norm": 1.3554507493972778, "kl": 0.020669213309884072, "learning_rate": 8.528471272979083e-07, "loss": -0.002560793608427048, "memory(GiB)": 69.34, "reward": 0.7864658117294312, "reward_std": 0.4851405739784241, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.43348987102508546, "rewards/ReportKG_Jaccard/mean": 0.23646580874919892, "rewards/ReportKG_Jaccard/std": 0.09003267362713814, "step": 3740, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 109.2, "completions/mean_length": 86.05, "completions/min_length": 67.6, "epoch": 0.7565656565656566, "grad_norm": 1.1173094511032104, "kl": 0.03175138644874096, "learning_rate": 8.463575781708055e-07, "loss": 0.03863131701946258, "memory(GiB)": 69.34, "reward": 0.9367511630058288, "reward_std": 0.4332002282142639, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.42594102025032043, "rewards/ReportKG_Jaccard/mean": 0.21175113320350647, "rewards/ReportKG_Jaccard/std": 0.049319709092378615, "step": 3745, "train_speed(iter/s)": 0.036889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 118.6, "completions/mean_length": 81.75, "completions/min_length": 57.4, "epoch": 0.7575757575757576, "grad_norm": 1.5512893199920654, "kl": 0.02639107033610344, "learning_rate": 8.398861808576039e-07, "loss": 0.057491248846054076, "memory(GiB)": 69.34, "reward": 0.8024991869926452, "reward_std": 0.5054236114025116, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4478123545646667, "rewards/ReportKG_Jaccard/mean": 0.2524991750717163, "rewards/ReportKG_Jaccard/std": 0.08013003692030907, "step": 3750, "train_speed(iter/s)": 0.0369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 75.65, "completions/min_length": 57.2, "epoch": 0.7585858585858586, "grad_norm": 1.1839182376861572, "kl": 0.0513246551156044, "learning_rate": 8.334330371815345e-07, "loss": 0.044838476181030276, "memory(GiB)": 69.34, "reward": 0.9856595456600189, "reward_std": 0.2403162106871605, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.1776151716709137, "rewards/ReportKG_Jaccard/mean": 0.31065956354141233, "rewards/ReportKG_Jaccard/std": 0.07039609774947167, "step": 3755, "train_speed(iter/s)": 0.036909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.8, "completions/mean_length": 82.35, "completions/min_length": 56.4, "epoch": 0.7595959595959596, "grad_norm": 0.9502052664756775, "kl": 0.0324993472546339, "learning_rate": 8.269982486786194e-07, "loss": 0.03105071187019348, "memory(GiB)": 69.34, "reward": 0.8975409030914306, "reward_std": 0.41419140100479124, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.38463483452796937, "rewards/ReportKG_Jaccard/mean": 0.17254092991352082, "rewards/ReportKG_Jaccard/std": 0.058782874047756194, "step": 3760, "train_speed(iter/s)": 0.036899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.8, "completions/mean_length": 77.525, "completions/min_length": 53.4, "epoch": 0.7606060606060606, "grad_norm": 1.1325968503952026, "kl": 0.03873908147215843, "learning_rate": 8.205819165960759e-07, "loss": 0.05545809268951416, "memory(GiB)": 69.34, "reward": 1.073724341392517, "reward_std": 0.36326110661029815, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.2237243488430977, "rewards/ReportKG_Jaccard/std": 0.08152787908911704, "step": 3765, "train_speed(iter/s)": 0.036902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.2, "completions/mean_length": 73.4, "completions/min_length": 56.6, "epoch": 0.7616161616161616, "grad_norm": 1.438603401184082, "kl": 0.019669535383582116, "learning_rate": 8.141841418907193e-07, "loss": 0.05182617902755737, "memory(GiB)": 69.34, "reward": 0.8610016256570816, "reward_std": 0.3206803023815155, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.2610016405582428, "rewards/ReportKG_Jaccard/std": 0.06882052570581436, "step": 3770, "train_speed(iter/s)": 0.036899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.4, "completions/mean_length": 69.725, "completions/min_length": 52.2, "epoch": 0.7626262626262627, "grad_norm": 0.9360145926475525, "kl": 0.0347586490213871, "learning_rate": 8.078050252273771e-07, "loss": 0.06007688641548157, "memory(GiB)": 69.34, "reward": 0.6551908314228058, "reward_std": 0.31699695289134977, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.1801908165216446, "rewards/ReportKG_Jaccard/std": 0.0562790859490633, "step": 3775, "train_speed(iter/s)": 0.03689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.2, "completions/mean_length": 70.375, "completions/min_length": 57.4, "epoch": 0.7636363636363637, "grad_norm": 0.9968180656433105, "kl": 0.055091821029782294, "learning_rate": 8.014446669773061e-07, "loss": 0.02228006273508072, "memory(GiB)": 69.34, "reward": 0.7458894610404968, "reward_std": 0.36513176262378694, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.17088944911956788, "rewards/ReportKG_Jaccard/std": 0.04106288850307464, "step": 3780, "train_speed(iter/s)": 0.036887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 76.725, "completions/min_length": 58.2, "epoch": 0.7646464646464647, "grad_norm": 1.6699955463409424, "kl": 0.022287565283477306, "learning_rate": 7.9510316721661e-07, "loss": -0.005520278215408325, "memory(GiB)": 69.34, "reward": 0.3601361870765686, "reward_std": 0.3304331585764885, "rewards/MultiModalAccuracyORM_Any/mean": 0.2, "rewards/MultiModalAccuracyORM_Any/std": 0.29206851720809934, "rewards/ReportKG_Jaccard/mean": 0.16013620644807816, "rewards/ReportKG_Jaccard/std": 0.0510417602956295, "step": 3785, "train_speed(iter/s)": 0.0369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/mean_length": 81.275, "completions/min_length": 59.6, "epoch": 0.7656565656565657, "grad_norm": 1.0828032493591309, "kl": 0.02382512018084526, "learning_rate": 7.88780625724667e-07, "loss": 0.03612504005432129, "memory(GiB)": 69.34, "reward": 0.943363618850708, "reward_std": 0.3773712545633316, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.31836357712745667, "rewards/ReportKG_Jaccard/std": 0.07308501228690148, "step": 3790, "train_speed(iter/s)": 0.036911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 74.5, "completions/min_length": 48.0, "epoch": 0.7666666666666667, "grad_norm": 1.1174166202545166, "kl": 0.02706191800534725, "learning_rate": 7.824771419825587e-07, "loss": 0.02147725373506546, "memory(GiB)": 69.34, "reward": 0.9530807733535767, "reward_std": 0.221068075299263, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.09258201122283935, "rewards/ReportKG_Jaccard/mean": 0.3030807912349701, "rewards/ReportKG_Jaccard/std": 0.1355612173676491, "step": 3795, "train_speed(iter/s)": 0.036917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 73.675, "completions/min_length": 56.6, "epoch": 0.7676767676767676, "grad_norm": 1.210686206817627, "kl": 0.02923839967697859, "learning_rate": 7.761928151715069e-07, "loss": 0.017580048739910127, "memory(GiB)": 69.34, "reward": 0.6857290714979172, "reward_std": 0.277160219848156, "rewards/MultiModalAccuracyORM_Any/mean": 0.475, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.21072907447814943, "rewards/ReportKG_Jaccard/std": 0.050663667544722554, "step": 3800, "train_speed(iter/s)": 0.036904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.0, "completions/mean_length": 85.125, "completions/min_length": 66.4, "epoch": 0.7686868686868686, "grad_norm": 1.0946805477142334, "kl": 0.038452855125069615, "learning_rate": 7.699277441713104e-07, "loss": 0.05935021042823792, "memory(GiB)": 69.34, "reward": 1.0695377111434936, "reward_std": 0.1422286793589592, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.24453771114349365, "rewards/ReportKG_Jaccard/std": 0.07602076232433319, "step": 3805, "train_speed(iter/s)": 0.036893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.0, "completions/mean_length": 85.425, "completions/min_length": 66.2, "epoch": 0.7696969696969697, "grad_norm": 1.1448619365692139, "kl": 0.024545509740710257, "learning_rate": 7.636820275587894e-07, "loss": 0.04098070859909057, "memory(GiB)": 69.34, "reward": 1.0213856935501098, "reward_std": 0.42852507829666137, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.22138567119836808, "rewards/ReportKG_Jaccard/std": 0.0637198694050312, "step": 3810, "train_speed(iter/s)": 0.036892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.0, "completions/mean_length": 75.525, "completions/min_length": 51.8, "epoch": 0.7707070707070707, "grad_norm": 1.1887601613998413, "kl": 0.022513171285390855, "learning_rate": 7.57455763606238e-07, "loss": -0.0055600225925445555, "memory(GiB)": 69.34, "reward": 0.7002016812562942, "reward_std": 0.4181156039237976, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.3771016776561737, "rewards/ReportKG_Jaccard/mean": 0.1752016633749008, "rewards/ReportKG_Jaccard/std": 0.058961163461208346, "step": 3815, "train_speed(iter/s)": 0.036897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.8, "completions/mean_length": 67.4, "completions/min_length": 53.4, "epoch": 0.7717171717171717, "grad_norm": 1.6544716358184814, "kl": 0.025721624307334424, "learning_rate": 7.512490502798734e-07, "loss": -0.00848729833960533, "memory(GiB)": 69.34, "reward": 0.6014801263809204, "reward_std": 0.31805378049612043, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.25148014426231385, "rewards/ReportKG_Jaccard/std": 0.07445361316204072, "step": 3820, "train_speed(iter/s)": 0.036908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 69.35, "completions/min_length": 48.4, "epoch": 0.7727272727272727, "grad_norm": 1.3115991353988647, "kl": 0.028247692808508872, "learning_rate": 7.450619852382958e-07, "loss": 0.018399390578269958, "memory(GiB)": 69.34, "reward": 0.6839487552642822, "reward_std": 0.2906422145664692, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.18394877314567565, "rewards/ReportKG_Jaccard/std": 0.07401911839842797, "step": 3825, "train_speed(iter/s)": 0.036901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 71.25, "completions/min_length": 53.0, "epoch": 0.7737373737373737, "grad_norm": 1.5020076036453247, "kl": 0.023968417756259443, "learning_rate": 7.388946658309556e-07, "loss": 3.692414611577988e-05, "memory(GiB)": 69.34, "reward": 0.6641532778739929, "reward_std": 0.4151178240776062, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.21415327191352845, "rewards/ReportKG_Jaccard/std": 0.07001645267009735, "step": 3830, "train_speed(iter/s)": 0.036916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.8, "completions/mean_length": 72.4, "completions/min_length": 49.0, "epoch": 0.7747474747474747, "grad_norm": 1.2774525880813599, "kl": 0.025623257644474505, "learning_rate": 7.327471890966135e-07, "loss": 0.04223035573959351, "memory(GiB)": 69.34, "reward": 0.8965035259723664, "reward_std": 0.3447210147976875, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.2465035229921341, "rewards/ReportKG_Jaccard/std": 0.12731145471334457, "step": 3835, "train_speed(iter/s)": 0.036926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.2, "completions/mean_length": 80.075, "completions/min_length": 59.4, "epoch": 0.7757575757575758, "grad_norm": 1.6545991897583008, "kl": 0.02556097097694874, "learning_rate": 7.266196517618237e-07, "loss": 0.041469433903694154, "memory(GiB)": 69.34, "reward": 0.6668286800384522, "reward_std": 0.44537935256958006, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.1668286919593811, "rewards/ReportKG_Jaccard/std": 0.06582138873636723, "step": 3840, "train_speed(iter/s)": 0.036926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 125.4, "completions/mean_length": 94.675, "completions/min_length": 64.2, "epoch": 0.7767676767676768, "grad_norm": 1.3100042343139648, "kl": 0.029413596726953983, "learning_rate": 7.205121502394039e-07, "loss": -0.013711827993392944, "memory(GiB)": 69.34, "reward": 0.8138257384300231, "reward_std": 0.2597425267100334, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2388257324695587, "rewards/ReportKG_Jaccard/std": 0.06012118756771088, "step": 3845, "train_speed(iter/s)": 0.036927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 67.65, "completions/min_length": 50.2, "epoch": 0.7777777777777778, "grad_norm": 0.8284870386123657, "kl": 0.030942625552415847, "learning_rate": 7.144247806269213e-07, "loss": -0.015356317162513733, "memory(GiB)": 69.34, "reward": 0.8848501563072204, "reward_std": 0.4140437752008438, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3812558650970459, "rewards/ReportKG_Jaccard/mean": 0.20985016524791716, "rewards/ReportKG_Jaccard/std": 0.05222611539065838, "step": 3850, "train_speed(iter/s)": 0.036928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.0, "completions/mean_length": 62.95, "completions/min_length": 44.2, "epoch": 0.7787878787878788, "grad_norm": 1.2120542526245117, "kl": 0.021822476759552956, "learning_rate": 7.083576387051826e-07, "loss": -0.016609355807304382, "memory(GiB)": 69.34, "reward": 0.845516586303711, "reward_std": 0.4894894778728485, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.245516574382782, "rewards/ReportKG_Jaccard/std": 0.057713451236486434, "step": 3855, "train_speed(iter/s)": 0.036934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.8, "completions/mean_length": 60.225, "completions/min_length": 47.0, "epoch": 0.7797979797979798, "grad_norm": 1.1128252744674683, "kl": 0.037563256174325946, "learning_rate": 7.023108199367234e-07, "loss": 0.010612136125564576, "memory(GiB)": 69.34, "reward": 0.6856259822845459, "reward_std": 0.29382962733507156, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.28562599420547485, "rewards/ReportKG_Jaccard/std": 0.08224406838417053, "step": 3860, "train_speed(iter/s)": 0.036943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 70.5, "completions/min_length": 55.2, "epoch": 0.7808080808080808, "grad_norm": 1.1719111204147339, "kl": 0.05485452674329281, "learning_rate": 6.962844194643067e-07, "loss": 0.02304798811674118, "memory(GiB)": 69.34, "reward": 0.4516435980796814, "reward_std": 0.2931954815983772, "rewards/MultiModalAccuracyORM_Any/mean": 0.275, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.17664359658956527, "rewards/ReportKG_Jaccard/std": 0.05951798297464848, "step": 3865, "train_speed(iter/s)": 0.036953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 71.05, "completions/min_length": 49.6, "epoch": 0.7818181818181819, "grad_norm": 1.670901894569397, "kl": 0.03106057532131672, "learning_rate": 6.9027853210943e-07, "loss": -0.03632751703262329, "memory(GiB)": 69.34, "reward": 0.8243884921073914, "reward_std": 0.41734063625335693, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.1743885025382042, "rewards/ReportKG_Jaccard/std": 0.05256110802292824, "step": 3870, "train_speed(iter/s)": 0.036958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.4, "completions/mean_length": 76.375, "completions/min_length": 56.6, "epoch": 0.7828282828282829, "grad_norm": 1.0024759769439697, "kl": 0.036007799208164215, "learning_rate": 6.842932523708274e-07, "loss": 0.002123086154460907, "memory(GiB)": 69.34, "reward": 0.7911971092224122, "reward_std": 0.5337030291557312, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.49493399262428284, "rewards/ReportKG_Jaccard/mean": 0.24119713604450227, "rewards/ReportKG_Jaccard/std": 0.08077076822519302, "step": 3875, "train_speed(iter/s)": 0.036964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 73.975, "completions/min_length": 59.4, "epoch": 0.7838383838383839, "grad_norm": 1.126147985458374, "kl": 0.015386023465543986, "learning_rate": 6.783286744229864e-07, "loss": 0.032866114377975465, "memory(GiB)": 69.34, "reward": 0.981695544719696, "reward_std": 0.2449817806482315, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.35669557452201844, "rewards/ReportKG_Jaccard/std": 0.1140805423259735, "step": 3880, "train_speed(iter/s)": 0.036969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.8, "completions/mean_length": 86.25, "completions/min_length": 61.4, "epoch": 0.7848484848484848, "grad_norm": 1.1635810136795044, "kl": 0.022197040170431136, "learning_rate": 6.723848921146649e-07, "loss": -0.037906762957572934, "memory(GiB)": 69.34, "reward": 0.5585240364074707, "reward_std": 0.4358521357178688, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.3921836853027344, "rewards/ReportKG_Jaccard/mean": 0.10852405279874802, "rewards/ReportKG_Jaccard/std": 0.05737122669816017, "step": 3885, "train_speed(iter/s)": 0.036975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 71.6, "completions/min_length": 55.6, "epoch": 0.7858585858585858, "grad_norm": 0.8910573124885559, "kl": 0.03785783033818006, "learning_rate": 6.66461998967417e-07, "loss": -0.012635339796543122, "memory(GiB)": 69.34, "reward": 0.7930589199066163, "reward_std": 0.3307645499706268, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.21805891692638396, "rewards/ReportKG_Jaccard/std": 0.10341631323099136, "step": 3890, "train_speed(iter/s)": 0.036971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 75.525, "completions/min_length": 57.8, "epoch": 0.7868686868686868, "grad_norm": 1.2143648862838745, "kl": 0.019546537287533285, "learning_rate": 6.605600881741167e-07, "loss": 0.03168123662471771, "memory(GiB)": 69.34, "reward": 1.097390389442444, "reward_std": 0.2705255389213562, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.3223903626203537, "rewards/ReportKG_Jaccard/std": 0.07128342166543007, "step": 3895, "train_speed(iter/s)": 0.03696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.0, "completions/mean_length": 74.475, "completions/min_length": 52.0, "epoch": 0.7878787878787878, "grad_norm": 1.2969186305999756, "kl": 0.01877252198755741, "learning_rate": 6.546792525974949e-07, "loss": -0.022976918518543242, "memory(GiB)": 69.34, "reward": 0.5530344843864441, "reward_std": 0.3366776555776596, "rewards/MultiModalAccuracyORM_Any/mean": 0.325, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.22803449928760527, "rewards/ReportKG_Jaccard/std": 0.06662982329726219, "step": 3900, "train_speed(iter/s)": 0.036952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.4, "completions/mean_length": 81.725, "completions/min_length": 65.0, "epoch": 0.7888888888888889, "grad_norm": 1.3109592199325562, "kl": 0.028002952970564366, "learning_rate": 6.488195847686795e-07, "loss": 0.014157035946846008, "memory(GiB)": 69.34, "reward": 0.7652148842811585, "reward_std": 0.24033747017383575, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.21521488726139068, "rewards/ReportKG_Jaccard/std": 0.08583886921405792, "step": 3905, "train_speed(iter/s)": 0.036937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.6, "completions/mean_length": 83.55, "completions/min_length": 54.8, "epoch": 0.7898989898989899, "grad_norm": 1.2386596202850342, "kl": 0.03319622091948986, "learning_rate": 6.429811768857358e-07, "loss": -0.05231530666351318, "memory(GiB)": 69.34, "reward": 0.802212655544281, "reward_std": 0.36056318432092666, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.15221265703439713, "rewards/ReportKG_Jaccard/std": 0.06306311003863811, "step": 3910, "train_speed(iter/s)": 0.036946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.0, "completions/mean_length": 77.8, "completions/min_length": 57.8, "epoch": 0.7909090909090909, "grad_norm": 1.1012521982192993, "kl": 0.031722687371075156, "learning_rate": 6.371641208122175e-07, "loss": 0.04655675888061524, "memory(GiB)": 69.34, "reward": 0.9837505459785462, "reward_std": 0.3992059364914894, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.23375052958726883, "rewards/ReportKG_Jaccard/std": 0.07335549592971802, "step": 3915, "train_speed(iter/s)": 0.036958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 65.525, "completions/min_length": 52.0, "epoch": 0.7919191919191919, "grad_norm": 1.4160996675491333, "kl": 0.030162811651825903, "learning_rate": 6.313685080757234e-07, "loss": 0.021499058604240416, "memory(GiB)": 69.34, "reward": 1.2226340532302857, "reward_std": 0.3060896903276443, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.2104143261909485, "rewards/ReportKG_Jaccard/mean": 0.39763405323028567, "rewards/ReportKG_Jaccard/std": 0.1176774613559246, "step": 3920, "train_speed(iter/s)": 0.036954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 74.875, "completions/min_length": 57.2, "epoch": 0.7929292929292929, "grad_norm": 1.2346910238265991, "kl": 0.022191460803151132, "learning_rate": 6.255944298664535e-07, "loss": -0.022057197988033295, "memory(GiB)": 69.34, "reward": 0.5947577804327011, "reward_std": 0.4870454967021942, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.4772168457508087, "rewards/ReportKG_Jaccard/mean": 0.21975778341293334, "rewards/ReportKG_Jaccard/std": 0.03762353025376797, "step": 3925, "train_speed(iter/s)": 0.036968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.0, "completions/mean_length": 73.5, "completions/min_length": 54.0, "epoch": 0.793939393939394, "grad_norm": 1.2500361204147339, "kl": 0.0223866181448102, "learning_rate": 6.198419770357763e-07, "loss": 0.016417285799980162, "memory(GiB)": 69.34, "reward": 0.6450598120689393, "reward_std": 0.26052902489900587, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.2200598120689392, "rewards/ReportKG_Jaccard/std": 0.07023268416523934, "step": 3930, "train_speed(iter/s)": 0.036972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.2, "completions/mean_length": 79.2, "completions/min_length": 52.8, "epoch": 0.794949494949495, "grad_norm": 1.1064364910125732, "kl": 0.03211029507219791, "learning_rate": 6.14111240094799e-07, "loss": -0.028964823484420775, "memory(GiB)": 69.34, "reward": 0.8852259039878845, "reward_std": 0.40735703706741333, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.3754247188568115, "rewards/ReportKG_Jaccard/mean": 0.18522590696811675, "rewards/ReportKG_Jaccard/std": 0.07187942489981651, "step": 3935, "train_speed(iter/s)": 0.036975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.8, "completions/mean_length": 77.675, "completions/min_length": 60.6, "epoch": 0.795959595959596, "grad_norm": 1.3598355054855347, "kl": 0.0271148145198822, "learning_rate": 6.084023092129447e-07, "loss": 0.011499100923538208, "memory(GiB)": 69.34, "reward": 0.8892297625541687, "reward_std": 0.35434018075466156, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.3190365254878998, "rewards/ReportKG_Jaccard/mean": 0.214229753613472, "rewards/ReportKG_Jaccard/std": 0.06221766695380211, "step": 3940, "train_speed(iter/s)": 0.036978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.8, "completions/mean_length": 80.55, "completions/min_length": 62.0, "epoch": 0.796969696969697, "grad_norm": 1.1586865186691284, "kl": 0.04900443106889725, "learning_rate": 6.027152742165309e-07, "loss": 0.01814490556716919, "memory(GiB)": 69.34, "reward": 1.085628366470337, "reward_std": 0.310374753177166, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.2856283634901047, "rewards/ReportKG_Jaccard/std": 0.06774450093507767, "step": 3945, "train_speed(iter/s)": 0.036991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 66.95, "completions/min_length": 52.8, "epoch": 0.797979797979798, "grad_norm": 1.281064748764038, "kl": 0.04015895314514637, "learning_rate": 5.970502245873572e-07, "loss": 0.04105042815208435, "memory(GiB)": 69.34, "reward": 0.810858952999115, "reward_std": 0.49274582266807554, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4410387217998505, "rewards/ReportKG_Jaccard/mean": 0.23585897386074067, "rewards/ReportKG_Jaccard/std": 0.09359320998191833, "step": 3950, "train_speed(iter/s)": 0.037005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 70.075, "completions/min_length": 53.8, "epoch": 0.798989898989899, "grad_norm": 1.336134433746338, "kl": 0.035530944168567655, "learning_rate": 5.914072494613005e-07, "loss": 0.04086272716522217, "memory(GiB)": 69.34, "reward": 1.172891104221344, "reward_std": 0.369794063270092, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.3228910803794861, "rewards/ReportKG_Jaccard/std": 0.08413084372878074, "step": 3955, "train_speed(iter/s)": 0.037019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.8, "completions/mean_length": 69.55, "completions/min_length": 49.6, "epoch": 0.8, "grad_norm": 1.6057372093200684, "kl": 0.028897070325911045, "learning_rate": 5.857864376269051e-07, "loss": 0.01570589393377304, "memory(GiB)": 69.34, "reward": 0.7356778442859649, "reward_std": 0.2945392087101936, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.2106778621673584, "rewards/ReportKG_Jaccard/std": 0.07385391965508462, "step": 3960, "train_speed(iter/s)": 0.037028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.0, "completions/mean_length": 81.65, "completions/min_length": 63.0, "epoch": 0.8010101010101011, "grad_norm": 1.0164680480957031, "kl": 0.015260051563382148, "learning_rate": 5.801878775239943e-07, "loss": -0.05797088146209717, "memory(GiB)": 69.34, "reward": 0.746462631225586, "reward_std": 0.34867037236690523, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.246462619304657, "rewards/ReportKG_Jaccard/std": 0.07659635171294213, "step": 3965, "train_speed(iter/s)": 0.037028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 62.3, "completions/min_length": 46.6, "epoch": 0.802020202020202, "grad_norm": 1.1955512762069702, "kl": 0.0332628183066845, "learning_rate": 5.746116572422748e-07, "loss": -0.034235012531280515, "memory(GiB)": 69.34, "reward": 0.7887142270803451, "reward_std": 0.22357722371816635, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.1887142091989517, "rewards/ReportKG_Jaccard/std": 0.08770224377512932, "step": 3970, "train_speed(iter/s)": 0.037039 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.0, "completions/mean_length": 77.025, "completions/min_length": 53.6, "epoch": 0.803030303030303, "grad_norm": 1.391994595527649, "kl": 0.018601108342409134, "learning_rate": 5.690578645199469e-07, "loss": 0.011218472570180892, "memory(GiB)": 69.34, "reward": 0.5233962535858154, "reward_std": 0.3907885909080505, "rewards/MultiModalAccuracyORM_Any/mean": 0.35, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.17339624166488649, "rewards/ReportKG_Jaccard/std": 0.08833709731698036, "step": 3975, "train_speed(iter/s)": 0.037042 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.0, "completions/mean_length": 77.825, "completions/min_length": 53.8, "epoch": 0.804040404040404, "grad_norm": 1.0923479795455933, "kl": 0.02355923131108284, "learning_rate": 5.63526586742332e-07, "loss": 0.04549340903759003, "memory(GiB)": 69.34, "reward": 0.8941884279251099, "reward_std": 0.25704833716154096, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.2441884309053421, "rewards/ReportKG_Jaccard/std": 0.06761016622185707, "step": 3980, "train_speed(iter/s)": 0.037049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.8, "completions/mean_length": 76.3, "completions/min_length": 57.2, "epoch": 0.805050505050505, "grad_norm": 1.1807693243026733, "kl": 0.026303902640938757, "learning_rate": 5.580179109404919e-07, "loss": 0.07350546717643738, "memory(GiB)": 69.34, "reward": 1.0110813856124878, "reward_std": 0.28035121113061906, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.2610813856124878, "rewards/ReportKG_Jaccard/std": 0.08584971092641354, "step": 3985, "train_speed(iter/s)": 0.037052 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.8, "completions/mean_length": 77.675, "completions/min_length": 54.2, "epoch": 0.806060606060606, "grad_norm": 1.5645925998687744, "kl": 0.023663947731256484, "learning_rate": 5.525319237898596e-07, "loss": 0.015539222955703735, "memory(GiB)": 69.34, "reward": 0.8395926713943481, "reward_std": 0.14009312763810158, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.21459270119667054, "rewards/ReportKG_Jaccard/std": 0.07236691787838936, "step": 3990, "train_speed(iter/s)": 0.037046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 77.2, "completions/mean_length": 60.575, "completions/min_length": 47.8, "epoch": 0.807070707070707, "grad_norm": 1.2498141527175903, "kl": 0.0266856899484992, "learning_rate": 5.470687116088799e-07, "loss": 0.030566173791885375, "memory(GiB)": 69.34, "reward": 1.1851491928100586, "reward_std": 0.32877316772937776, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.3351492345333099, "rewards/ReportKG_Jaccard/std": 0.10299442522227764, "step": 3995, "train_speed(iter/s)": 0.037061 }, { "epoch": 0.8080808080808081, "grad_norm": 1.007249116897583, "learning_rate": 5.416283603576459e-07, "loss": 0.06728068590164185, "memory(GiB)": 69.34, "step": 4000, "train_speed(iter/s)": 0.037054 }, { "epoch": 0.8080808080808081, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 101.32, "eval_completions/mean_length": 77.7525, "eval_completions/min_length": 58.28, "eval_kl": 0.02917233444750309, "eval_loss": 0.021278241649270058, "eval_reward": 0.7638763774931431, "eval_reward_std": 0.30516436882317066, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.55, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.26201597273349764, "eval_rewards/ReportKG_Jaccard/mean": 0.2138763752579689, "eval_rewards/ReportKG_Jaccard/std": 0.06754574183374644, "eval_runtime": 906.334, "eval_samples_per_second": 0.055, "eval_steps_per_second": 0.008, "step": 4000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.5, "completions/mean_length": 75.25, "completions/min_length": 52.0, "epoch": 0.8090909090909091, "grad_norm": 1.8946154117584229, "kl": 0.02323073549196124, "learning_rate": 5.362109556365495e-07, "loss": -0.05986601114273071, "memory(GiB)": 69.34, "reward": 0.6906467258930207, "reward_std": 0.40492683500051496, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3665457725524902, "rewards/ReportKG_Jaccard/mean": 0.19064673259854317, "rewards/ReportKG_Jaccard/std": 0.059483879059553144, "step": 4005, "train_speed(iter/s)": 0.036466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.6, "completions/mean_length": 75.225, "completions/min_length": 52.6, "epoch": 0.8101010101010101, "grad_norm": 0.9117390513420105, "kl": 0.020992421358823777, "learning_rate": 5.308165826849336e-07, "loss": 0.004149888455867767, "memory(GiB)": 69.34, "reward": 0.8674610257148743, "reward_std": 0.2863104686141014, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2674610078334808, "rewards/ReportKG_Jaccard/std": 0.07696851715445518, "step": 4010, "train_speed(iter/s)": 0.036467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.8, "completions/mean_length": 70.525, "completions/min_length": 49.8, "epoch": 0.8111111111111111, "grad_norm": 1.5018788576126099, "kl": 0.02760576829314232, "learning_rate": 5.25445326379752e-07, "loss": 0.05189487338066101, "memory(GiB)": 69.34, "reward": 1.0490003824234009, "reward_std": 0.36949819028377534, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.27400036454200744, "rewards/ReportKG_Jaccard/std": 0.06315787881612778, "step": 4015, "train_speed(iter/s)": 0.036466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 75.675, "completions/min_length": 59.2, "epoch": 0.8121212121212121, "grad_norm": 1.2683017253875732, "kl": 0.02794205565005541, "learning_rate": 5.200972712342326e-07, "loss": -0.006029093265533447, "memory(GiB)": 69.34, "reward": 1.180733871459961, "reward_std": 0.261529441177845, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.1632926881313324, "rewards/ReportKG_Jaccard/mean": 0.35573389530181887, "rewards/ReportKG_Jaccard/std": 0.11976018846035004, "step": 4020, "train_speed(iter/s)": 0.036467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.8, "completions/mean_length": 74.65, "completions/min_length": 49.2, "epoch": 0.8131313131313131, "grad_norm": 1.3683421611785889, "kl": 0.02087901495397091, "learning_rate": 5.147725013965474e-07, "loss": 0.030611172318458557, "memory(GiB)": 69.34, "reward": 0.5840782523155212, "reward_std": 0.4579583376646042, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.41743398904800416, "rewards/ReportKG_Jaccard/mean": 0.20907825827598572, "rewards/ReportKG_Jaccard/std": 0.07008587121963501, "step": 4025, "train_speed(iter/s)": 0.036466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.6, "completions/mean_length": 67.6, "completions/min_length": 52.4, "epoch": 0.8141414141414142, "grad_norm": 1.2293387651443481, "kl": 0.03920802418142557, "learning_rate": 5.094711006484907e-07, "loss": 0.05704864263534546, "memory(GiB)": 69.34, "reward": 0.7648953437805176, "reward_std": 0.5234077751636506, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.2148953467607498, "rewards/ReportKG_Jaccard/std": 0.06327410750091075, "step": 4030, "train_speed(iter/s)": 0.036475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 62.825, "completions/min_length": 48.8, "epoch": 0.8151515151515152, "grad_norm": 1.1103583574295044, "kl": 0.04509884044528008, "learning_rate": 5.041931524041583e-07, "loss": 0.03742363452911377, "memory(GiB)": 69.34, "reward": 0.8123525142669678, "reward_std": 0.4151559978723526, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.1623525246977806, "rewards/ReportKG_Jaccard/std": 0.08112010285258293, "step": 4035, "train_speed(iter/s)": 0.036467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/mean_length": 76.3, "completions/min_length": 55.8, "epoch": 0.8161616161616162, "grad_norm": 1.0897561311721802, "kl": 0.055476114153862, "learning_rate": 4.989387397086357e-07, "loss": 0.06687958836555481, "memory(GiB)": 69.34, "reward": 1.159949278831482, "reward_std": 0.2909981310367584, "rewards/MultiModalAccuracyORM_Any/mean": 0.9, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2599493369460106, "rewards/ReportKG_Jaccard/std": 0.08110946118831634, "step": 4040, "train_speed(iter/s)": 0.036475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 67.425, "completions/min_length": 49.8, "epoch": 0.8171717171717172, "grad_norm": 0.8920333385467529, "kl": 0.03487493097782135, "learning_rate": 4.937079452366935e-07, "loss": -0.005653520673513412, "memory(GiB)": 69.34, "reward": 1.002722942829132, "reward_std": 0.3728410840034485, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30299633741378784, "rewards/ReportKG_Jaccard/mean": 0.2777229309082031, "rewards/ReportKG_Jaccard/std": 0.09333709329366684, "step": 4045, "train_speed(iter/s)": 0.036475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.0, "completions/mean_length": 75.6, "completions/min_length": 58.0, "epoch": 0.8181818181818182, "grad_norm": 1.0596038103103638, "kl": 0.05605616383254528, "learning_rate": 4.885008512914837e-07, "loss": 0.008870096504688263, "memory(GiB)": 69.34, "reward": 0.7985017418861389, "reward_std": 0.3651298493146896, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.17350172773003578, "rewards/ReportKG_Jaccard/std": 0.04614068642258644, "step": 4050, "train_speed(iter/s)": 0.036464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 72.05, "completions/min_length": 56.4, "epoch": 0.8191919191919191, "grad_norm": 1.3281655311584473, "kl": 0.03291135374456644, "learning_rate": 4.833175398032457e-07, "loss": 0.022888115048408507, "memory(GiB)": 69.34, "reward": 0.6687588930130005, "reward_std": 0.3562822639942169, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.16875889301300048, "rewards/ReportKG_Jaccard/std": 0.04784134700894356, "step": 4055, "train_speed(iter/s)": 0.03648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 77.25, "completions/min_length": 59.8, "epoch": 0.8202020202020202, "grad_norm": 1.0915348529815674, "kl": 0.035398335009813306, "learning_rate": 4.781580923280171e-07, "loss": -0.028486189246177674, "memory(GiB)": 69.34, "reward": 0.945030128955841, "reward_std": 0.38245194852352143, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.19503015279769897, "rewards/ReportKG_Jaccard/std": 0.047038056328892706, "step": 4060, "train_speed(iter/s)": 0.036469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 95.6, "completions/mean_length": 73.125, "completions/min_length": 53.8, "epoch": 0.8212121212121212, "grad_norm": 1.2245209217071533, "kl": 0.04139171503484249, "learning_rate": 4.7302259004635293e-07, "loss": 0.061766958236694335, "memory(GiB)": 69.34, "reward": 1.0049607753753662, "reward_std": 0.44370766878128054, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.20496079623699187, "rewards/ReportKG_Jaccard/std": 0.12202444449067115, "step": 4065, "train_speed(iter/s)": 0.036486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.4, "completions/mean_length": 69.0, "completions/min_length": 52.6, "epoch": 0.8222222222222222, "grad_norm": 1.3467744588851929, "kl": 0.03315945062786341, "learning_rate": 4.679111137620442e-07, "loss": 0.029103899002075197, "memory(GiB)": 69.34, "reward": 1.0171638607978821, "reward_std": 0.33879479616880415, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.24216386675834656, "rewards/ReportKG_Jaccard/std": 0.06511048823595048, "step": 4070, "train_speed(iter/s)": 0.036493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.4, "completions/mean_length": 79.275, "completions/min_length": 59.8, "epoch": 0.8232323232323232, "grad_norm": 1.3363276720046997, "kl": 0.0211261585354805, "learning_rate": 4.62823743900848e-07, "loss": 0.018029353022575377, "memory(GiB)": 69.34, "reward": 0.813470995426178, "reward_std": 0.3921751990914345, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2884709745645523, "rewards/ReportKG_Jaccard/std": 0.09158326163887978, "step": 4075, "train_speed(iter/s)": 0.036495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 69.025, "completions/min_length": 53.2, "epoch": 0.8242424242424242, "grad_norm": 0.9583812952041626, "kl": 0.04004216678440571, "learning_rate": 4.5776056050922473e-07, "loss": 0.049049532413482665, "memory(GiB)": 69.34, "reward": 1.134187912940979, "reward_std": 0.28968686163425444, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.259187912940979, "rewards/ReportKG_Jaccard/std": 0.06556373313069344, "step": 4080, "train_speed(iter/s)": 0.036501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 69.4, "completions/min_length": 53.6, "epoch": 0.8252525252525252, "grad_norm": 0.8808609247207642, "kl": 0.030544058978557588, "learning_rate": 4.5272164325307473e-07, "loss": 0.029641860723495485, "memory(GiB)": 69.34, "reward": 0.7954851031303406, "reward_std": 0.3146233156323433, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.24548510909080506, "rewards/ReportKG_Jaccard/std": 0.06407725512981415, "step": 4085, "train_speed(iter/s)": 0.036514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.0, "completions/mean_length": 72.625, "completions/min_length": 57.2, "epoch": 0.8262626262626263, "grad_norm": 1.3832319974899292, "kl": 0.04948402419686317, "learning_rate": 4.477070714164861e-07, "loss": 0.030134755373001098, "memory(GiB)": 69.34, "reward": 0.7604659169912338, "reward_std": 0.3749727725982666, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3518356800079346, "rewards/ReportKG_Jaccard/mean": 0.18546591699123383, "rewards/ReportKG_Jaccard/std": 0.06478834114968776, "step": 4090, "train_speed(iter/s)": 0.036521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 102.2, "completions/mean_length": 71.25, "completions/min_length": 52.4, "epoch": 0.8272727272727273, "grad_norm": 1.376325249671936, "kl": 0.04228933975100517, "learning_rate": 4.427169239004902e-07, "loss": 0.021941661834716797, "memory(GiB)": 69.34, "reward": 0.9018104434013366, "reward_std": 0.33688838928937914, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.22681044638156891, "rewards/ReportKG_Jaccard/std": 0.08679043650627136, "step": 4095, "train_speed(iter/s)": 0.036524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 105.4, "completions/mean_length": 81.05, "completions/min_length": 61.0, "epoch": 0.8282828282828283, "grad_norm": 1.506461262702942, "kl": 0.030569331906735896, "learning_rate": 4.377512792218132e-07, "loss": 0.07415076494216918, "memory(GiB)": 69.34, "reward": 0.6301932990550995, "reward_std": 0.2319528914988041, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.20519327521324157, "rewards/ReportKG_Jaccard/std": 0.06793593913316727, "step": 4100, "train_speed(iter/s)": 0.036534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/mean_length": 73.0, "completions/min_length": 62.6, "epoch": 0.8292929292929293, "grad_norm": 1.6137560606002808, "kl": 0.0383821677416563, "learning_rate": 4.328102155116495e-07, "loss": 0.010371389985084533, "memory(GiB)": 69.34, "reward": 1.073478639125824, "reward_std": 0.3196213811635971, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22347867488861084, "rewards/ReportKG_Jaccard/std": 0.06280415840446948, "step": 4105, "train_speed(iter/s)": 0.036533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.8, "completions/mean_length": 81.3, "completions/min_length": 61.6, "epoch": 0.8303030303030303, "grad_norm": 1.491201639175415, "kl": 0.02958683855831623, "learning_rate": 4.278938105144254e-07, "loss": -0.014492490887641906, "memory(GiB)": 69.34, "reward": 0.828744712471962, "reward_std": 0.26242690682411196, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.25374471247196195, "rewards/ReportKG_Jaccard/std": 0.07535031698644161, "step": 4110, "train_speed(iter/s)": 0.036534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 70.025, "completions/min_length": 48.8, "epoch": 0.8313131313131313, "grad_norm": 1.3027046918869019, "kl": 0.02232372760772705, "learning_rate": 4.230021415865785e-07, "loss": -0.014129695296287537, "memory(GiB)": 69.34, "reward": 0.5727395415306091, "reward_std": 0.3314061537384987, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.19773952960968016, "rewards/ReportKG_Jaccard/std": 0.0703151673078537, "step": 4115, "train_speed(iter/s)": 0.036533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.0, "completions/mean_length": 86.625, "completions/min_length": 59.6, "epoch": 0.8323232323232324, "grad_norm": 1.081192135810852, "kl": 0.040833380818367, "learning_rate": 4.1813528569534174e-07, "loss": -0.06206895709037781, "memory(GiB)": 69.34, "reward": 0.7438997387886047, "reward_std": 0.4383934736251831, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.16889974251389503, "rewards/ReportKG_Jaccard/std": 0.04906768500804901, "step": 4120, "train_speed(iter/s)": 0.036541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 73.8, "completions/mean_length": 63.7, "completions/min_length": 51.0, "epoch": 0.8333333333333334, "grad_norm": 1.146206259727478, "kl": 0.02839268036186695, "learning_rate": 4.132933194175299e-07, "loss": 0.06630042791366578, "memory(GiB)": 69.34, "reward": 0.9812798976898194, "reward_std": 0.4582886561751366, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.3846505284309387, "rewards/ReportKG_Jaccard/mean": 0.33127992153167723, "rewards/ReportKG_Jaccard/std": 0.10075798332691192, "step": 4125, "train_speed(iter/s)": 0.036547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.6, "completions/mean_length": 66.975, "completions/min_length": 51.4, "epoch": 0.8343434343434344, "grad_norm": 1.7362251281738281, "kl": 0.0375378955155611, "learning_rate": 4.084763189383356e-07, "loss": -0.020507438480854033, "memory(GiB)": 69.34, "reward": 0.7557844176888466, "reward_std": 0.2576370909810066, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.20701966285705567, "rewards/ReportKG_Jaccard/mean": 0.15578440874814986, "rewards/ReportKG_Jaccard/std": 0.05858471468091011, "step": 4130, "train_speed(iter/s)": 0.036561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 108.4, "completions/mean_length": 73.875, "completions/min_length": 51.8, "epoch": 0.8353535353535354, "grad_norm": 1.0672725439071655, "kl": 0.028233875147998334, "learning_rate": 4.036843600501325e-07, "loss": 0.041341930627822876, "memory(GiB)": 69.34, "reward": 1.0182014226913452, "reward_std": 0.34932819455862046, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.1682014636695385, "rewards/ReportKG_Jaccard/std": 0.05394093655049801, "step": 4135, "train_speed(iter/s)": 0.036568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.4, "completions/mean_length": 70.875, "completions/min_length": 54.2, "epoch": 0.8363636363636363, "grad_norm": 1.1001875400543213, "kl": 0.03198786452412605, "learning_rate": 3.9891751815127937e-07, "loss": 0.1058552622795105, "memory(GiB)": 69.34, "reward": 1.2778087615966798, "reward_std": 0.32894624024629593, "rewards/MultiModalAccuracyORM_Any/mean": 0.925, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.3528087496757507, "rewards/ReportKG_Jaccard/std": 0.14319533929228784, "step": 4140, "train_speed(iter/s)": 0.036565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.2, "completions/mean_length": 71.375, "completions/min_length": 56.2, "epoch": 0.8373737373737373, "grad_norm": 0.9431859254837036, "kl": 0.02961362637579441, "learning_rate": 3.941758682449363e-07, "loss": 0.03429582715034485, "memory(GiB)": 69.34, "reward": 1.0211557507514955, "reward_std": 0.2918805435299873, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2711557298898697, "rewards/ReportKG_Jaccard/std": 0.08677085041999817, "step": 4145, "train_speed(iter/s)": 0.036567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.6, "completions/mean_length": 73.8, "completions/min_length": 59.6, "epoch": 0.8383838383838383, "grad_norm": 1.3088639974594116, "kl": 0.036689478904008865, "learning_rate": 3.894594849378827e-07, "loss": 0.012331856787204743, "memory(GiB)": 69.34, "reward": 0.8458573579788208, "reward_std": 0.5017991900444031, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.4587401747703552, "rewards/ReportKG_Jaccard/mean": 0.22085736989974974, "rewards/ReportKG_Jaccard/std": 0.07112784534692765, "step": 4150, "train_speed(iter/s)": 0.036573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 97.8, "completions/mean_length": 72.325, "completions/min_length": 57.2, "epoch": 0.8393939393939394, "grad_norm": 1.0338283777236938, "kl": 0.04490675600245595, "learning_rate": 3.847684424393469e-07, "loss": 0.05136773586273193, "memory(GiB)": 69.34, "reward": 0.7533246994018554, "reward_std": 0.28360954150557516, "rewards/MultiModalAccuracyORM_Any/mean": 0.45, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.3033247008919716, "rewards/ReportKG_Jaccard/std": 0.07202451080083846, "step": 4155, "train_speed(iter/s)": 0.036577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 69.7, "completions/min_length": 52.2, "epoch": 0.8404040404040404, "grad_norm": 1.1140978336334229, "kl": 0.03881627842783928, "learning_rate": 3.801028145598335e-07, "loss": 0.08809124231338501, "memory(GiB)": 69.34, "reward": 0.9659388720989227, "reward_std": 0.34538788050413133, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.24832584857940673, "rewards/ReportKG_Jaccard/mean": 0.26593888700008395, "rewards/ReportKG_Jaccard/std": 0.11309503056108952, "step": 4160, "train_speed(iter/s)": 0.036581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 71.35, "completions/min_length": 54.2, "epoch": 0.8414141414141414, "grad_norm": 0.8106781840324402, "kl": 0.027228619903326035, "learning_rate": 3.754626747099652e-07, "loss": 0.02835933268070221, "memory(GiB)": 69.34, "reward": 0.898475730419159, "reward_std": 0.3220705732703209, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.17347571700811387, "rewards/ReportKG_Jaccard/std": 0.04982728809118271, "step": 4165, "train_speed(iter/s)": 0.036591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.8, "completions/mean_length": 70.275, "completions/min_length": 50.8, "epoch": 0.8424242424242424, "grad_norm": 0.977986752986908, "kl": 0.031642910279333594, "learning_rate": 3.7084809589932854e-07, "loss": 0.008070851862430572, "memory(GiB)": 69.34, "reward": 0.7445592105388641, "reward_std": 0.3766497239470482, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.24455921053886415, "rewards/ReportKG_Jaccard/std": 0.0823230005800724, "step": 4170, "train_speed(iter/s)": 0.036596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.8, "completions/mean_length": 70.05, "completions/min_length": 52.4, "epoch": 0.8434343434343434, "grad_norm": 1.5254589319229126, "kl": 0.027080820128321647, "learning_rate": 3.662591507353223e-07, "loss": 0.07174933552742005, "memory(GiB)": 69.34, "reward": 1.0065733432769775, "reward_std": 0.33840263783931734, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.2845196664333344, "rewards/ReportKG_Jaccard/mean": 0.23157334327697754, "rewards/ReportKG_Jaccard/std": 0.06865864396095275, "step": 4175, "train_speed(iter/s)": 0.036601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 69.95, "completions/min_length": 52.4, "epoch": 0.8444444444444444, "grad_norm": 1.821381688117981, "kl": 0.030847423523664475, "learning_rate": 3.6169591142201617e-07, "loss": 0.057618331909179685, "memory(GiB)": 69.34, "reward": 0.9146918892860413, "reward_std": 0.388785283267498, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.2886738538742065, "rewards/ReportKG_Jaccard/mean": 0.2896919120103121, "rewards/ReportKG_Jaccard/std": 0.11297612562775612, "step": 4180, "train_speed(iter/s)": 0.036604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 110.6, "completions/mean_length": 79.875, "completions/min_length": 59.8, "epoch": 0.8454545454545455, "grad_norm": 1.317879319190979, "kl": 0.02555729541927576, "learning_rate": 3.571584497590174e-07, "loss": -0.060777926445007326, "memory(GiB)": 69.34, "reward": 0.6894265115261078, "reward_std": 0.38502724170684816, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3897472023963928, "rewards/ReportKG_Jaccard/mean": 0.18942652344703675, "rewards/ReportKG_Jaccard/std": 0.04160146098583937, "step": 4185, "train_speed(iter/s)": 0.036593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 73.5, "completions/min_length": 52.2, "epoch": 0.8464646464646465, "grad_norm": 1.398066520690918, "kl": 0.035487842932343486, "learning_rate": 3.526468371403346e-07, "loss": -0.023683851957321166, "memory(GiB)": 69.34, "reward": 0.8416564017534256, "reward_std": 0.13598784282803536, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.2666564077138901, "rewards/ReportKG_Jaccard/std": 0.10416927412152291, "step": 4190, "train_speed(iter/s)": 0.036572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 76.475, "completions/min_length": 53.8, "epoch": 0.8474747474747475, "grad_norm": 1.3434432744979858, "kl": 0.03178669549524784, "learning_rate": 3.481611445532624e-07, "loss": 0.05773537158966065, "memory(GiB)": 69.34, "reward": 0.874362587928772, "reward_std": 0.29283888787031176, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.19936258792877198, "rewards/ReportKG_Jaccard/std": 0.06492239497601986, "step": 4195, "train_speed(iter/s)": 0.036577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.8, "completions/mean_length": 70.075, "completions/min_length": 53.0, "epoch": 0.8484848484848485, "grad_norm": 1.1623928546905518, "kl": 0.03061071187257767, "learning_rate": 3.4370144257725865e-07, "loss": 0.03830806314945221, "memory(GiB)": 69.34, "reward": 0.8498185127973557, "reward_std": 0.3473422646522522, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.2248185396194458, "rewards/ReportKG_Jaccard/std": 0.06580631583929061, "step": 4200, "train_speed(iter/s)": 0.036591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 74.0, "completions/min_length": 60.2, "epoch": 0.8494949494949495, "grad_norm": 0.9681943655014038, "kl": 0.024079494178295135, "learning_rate": 3.392678013828356e-07, "loss": -0.031463822722434996, "memory(GiB)": 69.34, "reward": 0.9399202346801758, "reward_std": 0.17916345447301865, "rewards/MultiModalAccuracyORM_Any/mean": 0.725, "rewards/MultiModalAccuracyORM_Any/std": 0.10350983142852783, "rewards/ReportKG_Jaccard/mean": 0.21492022275924683, "rewards/ReportKG_Jaccard/std": 0.07684798017144204, "step": 4205, "train_speed(iter/s)": 0.036597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 70.325, "completions/min_length": 56.2, "epoch": 0.8505050505050505, "grad_norm": 0.741170346736908, "kl": 0.01859814766794443, "learning_rate": 3.348602907304574e-07, "loss": -0.027637094259262085, "memory(GiB)": 69.34, "reward": 0.8984657883644104, "reward_std": 0.2684148222208023, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.2449311852455139, "rewards/ReportKG_Jaccard/mean": 0.27346581816673277, "rewards/ReportKG_Jaccard/std": 0.07267598360776902, "step": 4210, "train_speed(iter/s)": 0.036604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 79.975, "completions/min_length": 55.0, "epoch": 0.8515151515151516, "grad_norm": 1.0187606811523438, "kl": 0.02181231454014778, "learning_rate": 3.304789799694394e-07, "loss": -0.002716157212853432, "memory(GiB)": 69.34, "reward": 0.6891131117939949, "reward_std": 0.3768241986632347, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3409078598022461, "rewards/ReportKG_Jaccard/mean": 0.1891131043434143, "rewards/ReportKG_Jaccard/std": 0.06482217088341713, "step": 4215, "train_speed(iter/s)": 0.036609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 78.6, "completions/min_length": 57.8, "epoch": 0.8525252525252526, "grad_norm": 1.1604896783828735, "kl": 0.01457678321748972, "learning_rate": 3.2612393803686055e-07, "loss": -0.004432182013988495, "memory(GiB)": 69.34, "reward": 0.9128745913505554, "reward_std": 0.4081773653626442, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.36277919411659243, "rewards/ReportKG_Jaccard/mean": 0.23787456452846528, "rewards/ReportKG_Jaccard/std": 0.05371894910931587, "step": 4220, "train_speed(iter/s)": 0.036618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 65.95, "completions/min_length": 47.4, "epoch": 0.8535353535353535, "grad_norm": 1.107249140739441, "kl": 0.048651008307933806, "learning_rate": 3.2179523345647553e-07, "loss": 0.012902182340621949, "memory(GiB)": 69.34, "reward": 0.9943760633468628, "reward_std": 0.38258619904518126, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.19437606632709503, "rewards/ReportKG_Jaccard/std": 0.09984674081206321, "step": 4225, "train_speed(iter/s)": 0.036631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 111.6, "completions/mean_length": 84.05, "completions/min_length": 57.4, "epoch": 0.8545454545454545, "grad_norm": 1.0959552526474, "kl": 0.02821904979646206, "learning_rate": 3.1749293433763736e-07, "loss": -0.06532486081123352, "memory(GiB)": 69.34, "reward": 0.7430677950382233, "reward_std": 0.3862087965011597, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.3535533845424652, "rewards/ReportKG_Jaccard/mean": 0.16806779503822328, "rewards/ReportKG_Jaccard/std": 0.06388941183686256, "step": 4230, "train_speed(iter/s)": 0.036636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.6, "completions/mean_length": 68.875, "completions/min_length": 48.4, "epoch": 0.8555555555555555, "grad_norm": 1.1109626293182373, "kl": 0.0434935450553894, "learning_rate": 3.13217108374229e-07, "loss": 0.08502500653266906, "memory(GiB)": 69.34, "reward": 1.1290700912475586, "reward_std": 0.30355794429779054, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.2540701121091843, "rewards/ReportKG_Jaccard/std": 0.07555176168680192, "step": 4235, "train_speed(iter/s)": 0.036648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.2, "completions/mean_length": 83.0, "completions/min_length": 62.8, "epoch": 0.8565656565656565, "grad_norm": 1.38374924659729, "kl": 0.018616877123713495, "learning_rate": 3.089678228435915e-07, "loss": -0.013027088344097137, "memory(GiB)": 69.34, "reward": 0.7725218325853348, "reward_std": 0.39615253508090975, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.2225218653678894, "rewards/ReportKG_Jaccard/std": 0.07170141860842705, "step": 4240, "train_speed(iter/s)": 0.036648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 78.6, "completions/mean_length": 66.275, "completions/min_length": 50.4, "epoch": 0.8575757575757575, "grad_norm": 1.2667236328125, "kl": 0.023515591956675054, "learning_rate": 3.0474514460547295e-07, "loss": -0.017603325843811034, "memory(GiB)": 69.34, "reward": 0.9508451700210572, "reward_std": 0.476545113325119, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.43009520769119264, "rewards/ReportKG_Jaccard/mean": 0.2758451595902443, "rewards/ReportKG_Jaccard/std": 0.06963934488594532, "step": 4245, "train_speed(iter/s)": 0.036656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.6, "completions/mean_length": 81.4, "completions/min_length": 58.0, "epoch": 0.8585858585858586, "grad_norm": 1.962043046951294, "kl": 0.030660936236381532, "learning_rate": 3.0054914010097144e-07, "loss": 0.007987764477729798, "memory(GiB)": 69.34, "reward": 0.9033862054347992, "reward_std": 0.3517624393105507, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.29206851720809934, "rewards/ReportKG_Jaccard/mean": 0.20338622927665712, "rewards/ReportKG_Jaccard/std": 0.07562801167368889, "step": 4250, "train_speed(iter/s)": 0.036651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 87.8, "completions/mean_length": 70.35, "completions/min_length": 57.0, "epoch": 0.8595959595959596, "grad_norm": 1.0076560974121094, "kl": 0.034910655580461025, "learning_rate": 2.96379875351491e-07, "loss": 0.05581285357475281, "memory(GiB)": 69.34, "reward": 1.0092128098011017, "reward_std": 0.28642177730798724, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.30921280980110166, "rewards/ReportKG_Jaccard/std": 0.07304671034216881, "step": 4255, "train_speed(iter/s)": 0.036666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.2, "completions/mean_length": 74.625, "completions/min_length": 57.0, "epoch": 0.8606060606060606, "grad_norm": 1.531592607498169, "kl": 0.02139020599424839, "learning_rate": 2.922374159577039e-07, "loss": -0.017703549563884736, "memory(GiB)": 69.34, "reward": 0.7977461636066436, "reward_std": 0.4907503604888916, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.24774616360664367, "rewards/ReportKG_Jaccard/std": 0.07541293576359749, "step": 4260, "train_speed(iter/s)": 0.036668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.6, "completions/mean_length": 87.7, "completions/min_length": 66.6, "epoch": 0.8616161616161616, "grad_norm": 1.145945429801941, "kl": 0.022454402595758437, "learning_rate": 2.8812182709851684e-07, "loss": -0.005484357476234436, "memory(GiB)": 69.34, "reward": 0.7746980786323547, "reward_std": 0.32847292721271515, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.27019718289375305, "rewards/ReportKG_Jaccard/mean": 0.24969805777072906, "rewards/ReportKG_Jaccard/std": 0.07448726743459702, "step": 4265, "train_speed(iter/s)": 0.036674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 77.225, "completions/min_length": 58.0, "epoch": 0.8626262626262626, "grad_norm": 1.9155248403549194, "kl": 0.021233450435101987, "learning_rate": 2.8403317353004585e-07, "loss": 0.038179400563240054, "memory(GiB)": 69.34, "reward": 0.7906251966953277, "reward_std": 0.32940230816602706, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.27773033976554873, "rewards/ReportKG_Jaccard/mean": 0.21562518775463105, "rewards/ReportKG_Jaccard/std": 0.06615647077560424, "step": 4270, "train_speed(iter/s)": 0.036659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 103.4, "completions/mean_length": 75.8, "completions/min_length": 58.4, "epoch": 0.8636363636363636, "grad_norm": 1.0998568534851074, "kl": 0.02991343103349209, "learning_rate": 2.7997151958459884e-07, "loss": 0.023359456658363344, "memory(GiB)": 69.34, "reward": 0.9818273663520813, "reward_std": 0.3319572687149048, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.29206851720809934, "rewards/ReportKG_Jaccard/mean": 0.18182736039161682, "rewards/ReportKG_Jaccard/std": 0.06205914467573166, "step": 4275, "train_speed(iter/s)": 0.036666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.6, "completions/mean_length": 67.075, "completions/min_length": 52.2, "epoch": 0.8646464646464647, "grad_norm": 1.0357333421707153, "kl": 0.03268580343574286, "learning_rate": 2.759369291696614e-07, "loss": 0.03106358051300049, "memory(GiB)": 69.34, "reward": 0.9689328908920288, "reward_std": 0.4602164447307587, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.4082238733768463, "rewards/ReportKG_Jaccard/mean": 0.3189328908920288, "rewards/ReportKG_Jaccard/std": 0.0994376339018345, "step": 4280, "train_speed(iter/s)": 0.036662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.2, "completions/mean_length": 76.65, "completions/min_length": 60.4, "epoch": 0.8656565656565657, "grad_norm": 1.344506025314331, "kl": 0.02456991784274578, "learning_rate": 2.7192946576689203e-07, "loss": -0.02588338255882263, "memory(GiB)": 69.34, "reward": 0.8487532079219818, "reward_std": 0.27785519734025, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.2487531930208206, "rewards/ReportKG_Jaccard/std": 0.0694795474410057, "step": 4285, "train_speed(iter/s)": 0.036664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 94.8, "completions/mean_length": 71.675, "completions/min_length": 55.8, "epoch": 0.8666666666666667, "grad_norm": 1.5487018823623657, "kl": 0.04035500679165125, "learning_rate": 2.6794919243112256e-07, "loss": 0.08711923956871033, "memory(GiB)": 69.34, "reward": 0.8165765881538392, "reward_std": 0.36191724687814714, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.2665765851736069, "rewards/ReportKG_Jaccard/std": 0.07377565130591393, "step": 4290, "train_speed(iter/s)": 0.036646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 106.8, "completions/mean_length": 77.675, "completions/min_length": 54.4, "epoch": 0.8676767676767677, "grad_norm": 1.199105143547058, "kl": 0.020941758714616298, "learning_rate": 2.639961717893691e-07, "loss": 0.06262814998626709, "memory(GiB)": 69.34, "reward": 0.41571204364299774, "reward_std": 0.35071487426757814, "rewards/MultiModalAccuracyORM_Any/mean": 0.25, "rewards/MultiModalAccuracyORM_Any/std": 0.30639100074768066, "rewards/ReportKG_Jaccard/mean": 0.16571203768253326, "rewards/ReportKG_Jaccard/std": 0.06383172273635865, "step": 4295, "train_speed(iter/s)": 0.036645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 65.575, "completions/min_length": 44.8, "epoch": 0.8686868686868687, "grad_norm": 1.595555305480957, "kl": 0.027385135367512702, "learning_rate": 2.6007046603984207e-07, "loss": -0.044563084840774536, "memory(GiB)": 69.34, "reward": 0.7470289558172226, "reward_std": 0.4654842436313629, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.4444176912307739, "rewards/ReportKG_Jaccard/mean": 0.22202896177768708, "rewards/ReportKG_Jaccard/std": 0.0572405744343996, "step": 4300, "train_speed(iter/s)": 0.036645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.2, "completions/mean_length": 76.95, "completions/min_length": 59.0, "epoch": 0.8696969696969697, "grad_norm": 1.0177580118179321, "kl": 0.04799273423850536, "learning_rate": 2.561721369509704e-07, "loss": 0.06008824110031128, "memory(GiB)": 69.34, "reward": 0.804158678650856, "reward_std": 0.3547975242137909, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2541587010025978, "rewards/ReportKG_Jaccard/std": 0.05859261527657509, "step": 4305, "train_speed(iter/s)": 0.036647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.6, "completions/mean_length": 70.7, "completions/min_length": 53.0, "epoch": 0.8707070707070707, "grad_norm": 1.126837968826294, "kl": 0.034251698106527326, "learning_rate": 2.523012458604301e-07, "loss": -0.019963735342025758, "memory(GiB)": 69.34, "reward": 0.7112439781427383, "reward_std": 0.43701767921447754, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.4191516935825348, "rewards/ReportKG_Jaccard/mean": 0.1362439773976803, "rewards/ReportKG_Jaccard/std": 0.03734003975987434, "step": 4310, "train_speed(iter/s)": 0.036647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 81.375, "completions/min_length": 62.0, "epoch": 0.8717171717171717, "grad_norm": 1.219338059425354, "kl": 0.03079035598784685, "learning_rate": 2.484578536741766e-07, "loss": -0.04279496371746063, "memory(GiB)": 69.34, "reward": 0.8744830429553986, "reward_std": 0.27508836686611177, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.24948309659957885, "rewards/ReportKG_Jaccard/std": 0.0777359165251255, "step": 4315, "train_speed(iter/s)": 0.036639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.8, "completions/mean_length": 67.05, "completions/min_length": 49.8, "epoch": 0.8727272727272727, "grad_norm": 1.5687891244888306, "kl": 0.03296720683574676, "learning_rate": 2.446420208654887e-07, "loss": 0.037347492575645444, "memory(GiB)": 69.34, "reward": 1.100601279735565, "reward_std": 0.31103203594684603, "rewards/MultiModalAccuracyORM_Any/mean": 0.8, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.30060129761695864, "rewards/ReportKG_Jaccard/std": 0.09207391738891602, "step": 4320, "train_speed(iter/s)": 0.036641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.2, "completions/mean_length": 64.6, "completions/min_length": 46.6, "epoch": 0.8737373737373737, "grad_norm": 1.4702564477920532, "kl": 0.019469849392771722, "learning_rate": 2.408538074740167e-07, "loss": 0.033822661638259886, "memory(GiB)": 69.34, "reward": 0.5228290915489197, "reward_std": 0.37594123035669325, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.22282907366752625, "rewards/ReportKG_Jaccard/std": 0.10098379850387573, "step": 4325, "train_speed(iter/s)": 0.036652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 68.375, "completions/min_length": 53.4, "epoch": 0.8747474747474747, "grad_norm": 1.3318545818328857, "kl": 0.03582445122301579, "learning_rate": 2.3709327310483608e-07, "loss": 0.09234139919281006, "memory(GiB)": 69.34, "reward": 0.8625025093555451, "reward_std": 0.3511525303125381, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.31250251829624176, "rewards/ReportKG_Jaccard/std": 0.11935575157403946, "step": 4330, "train_speed(iter/s)": 0.036645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 82.6, "completions/mean_length": 64.425, "completions/min_length": 52.6, "epoch": 0.8757575757575757, "grad_norm": 1.0693368911743164, "kl": 0.04010193608701229, "learning_rate": 2.3336047692751214e-07, "loss": 0.03125073313713074, "memory(GiB)": 69.34, "reward": 1.0945130705833435, "reward_std": 0.3475583553314209, "rewards/MultiModalAccuracyORM_Any/mean": 0.875, "rewards/MultiModalAccuracyORM_Any/std": 0.30471404194831847, "rewards/ReportKG_Jaccard/mean": 0.21951308846473694, "rewards/ReportKG_Jaccard/std": 0.07474917769432068, "step": 4335, "train_speed(iter/s)": 0.036643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 78.875, "completions/min_length": 62.4, "epoch": 0.8767676767676768, "grad_norm": 1.2865208387374878, "kl": 0.04428022354841232, "learning_rate": 2.2965547767516714e-07, "loss": 0.009716839343309403, "memory(GiB)": 69.34, "reward": 0.7914891839027405, "reward_std": 0.47259712517261504, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.41743398904800416, "rewards/ReportKG_Jaccard/mean": 0.21648917123675346, "rewards/ReportKG_Jaccard/std": 0.0670377105474472, "step": 4340, "train_speed(iter/s)": 0.036647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.0, "completions/mean_length": 76.35, "completions/min_length": 55.6, "epoch": 0.8777777777777778, "grad_norm": 1.4679462909698486, "kl": 0.023988673649728297, "learning_rate": 2.2597833364355655e-07, "loss": 0.027081114053726197, "memory(GiB)": 69.34, "reward": 0.8187852442264557, "reward_std": 0.3689376816153526, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.24378523230552673, "rewards/ReportKG_Jaccard/std": 0.06871328130364418, "step": 4345, "train_speed(iter/s)": 0.036642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.4, "completions/mean_length": 77.875, "completions/min_length": 63.4, "epoch": 0.8787878787878788, "grad_norm": 1.3071750402450562, "kl": 0.032935333997011186, "learning_rate": 2.2232910269015326e-07, "loss": -0.0029394064098596574, "memory(GiB)": 69.34, "reward": 0.7715178310871125, "reward_std": 0.44656228423118594, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.4225463569164276, "rewards/ReportKG_Jaccard/mean": 0.22151785641908645, "rewards/ReportKG_Jaccard/std": 0.07943041771650314, "step": 4350, "train_speed(iter/s)": 0.036647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 81.6, "completions/mean_length": 67.2, "completions/min_length": 48.8, "epoch": 0.8797979797979798, "grad_norm": 1.4221762418746948, "kl": 0.01719949468970299, "learning_rate": 2.1870784223323445e-07, "loss": -0.02114272564649582, "memory(GiB)": 69.34, "reward": 0.5564346969127655, "reward_std": 0.4910739302635193, "rewards/MultiModalAccuracyORM_Any/mean": 0.3, "rewards/MultiModalAccuracyORM_Any/std": 0.4410230278968811, "rewards/ReportKG_Jaccard/mean": 0.2564346954226494, "rewards/ReportKG_Jaccard/std": 0.07984972596168519, "step": 4355, "train_speed(iter/s)": 0.036657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 86.0, "completions/mean_length": 74.125, "completions/min_length": 61.2, "epoch": 0.8808080808080808, "grad_norm": 1.0412875413894653, "kl": 0.0403218325227499, "learning_rate": 2.1511460925098014e-07, "loss": 0.022187231481075285, "memory(GiB)": 69.34, "reward": 1.1875709295272827, "reward_std": 0.16287218034267426, "rewards/MultiModalAccuracyORM_Any/mean": 0.9, "rewards/MultiModalAccuracyORM_Any/std": 0.10690449476242066, "rewards/ReportKG_Jaccard/mean": 0.28757094740867617, "rewards/ReportKG_Jaccard/std": 0.08538681119680405, "step": 4360, "train_speed(iter/s)": 0.036661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.6, "completions/mean_length": 70.4, "completions/min_length": 54.4, "epoch": 0.8818181818181818, "grad_norm": 1.2358342409133911, "kl": 0.025746802799403667, "learning_rate": 2.115494602805774e-07, "loss": 0.025959575176239015, "memory(GiB)": 69.34, "reward": 0.8400613725185394, "reward_std": 0.3190171577036381, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.2828427076339722, "rewards/ReportKG_Jaccard/mean": 0.24006137251853943, "rewards/ReportKG_Jaccard/std": 0.07610496804118157, "step": 4365, "train_speed(iter/s)": 0.036669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 96.8, "completions/mean_length": 73.85, "completions/min_length": 53.4, "epoch": 0.8828282828282829, "grad_norm": 1.3496688604354858, "kl": 0.03083487059921026, "learning_rate": 2.080124514173285e-07, "loss": -0.034712168574333194, "memory(GiB)": 69.34, "reward": 0.624084335565567, "reward_std": 0.40416252315044404, "rewards/MultiModalAccuracyORM_Any/mean": 0.375, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.24908433556556703, "rewards/ReportKG_Jaccard/std": 0.05546487979590893, "step": 4370, "train_speed(iter/s)": 0.036672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 120.0, "completions/mean_length": 85.9, "completions/min_length": 58.8, "epoch": 0.8838383838383839, "grad_norm": 1.383213996887207, "kl": 0.03028872311115265, "learning_rate": 2.0450363831376927e-07, "loss": 0.061299574375152585, "memory(GiB)": 69.34, "reward": 0.8086188673973084, "reward_std": 0.4292404010891914, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.18361885249614715, "rewards/ReportKG_Jaccard/std": 0.07556530311703683, "step": 4375, "train_speed(iter/s)": 0.036668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 69.3, "completions/min_length": 56.0, "epoch": 0.8848484848484849, "grad_norm": 1.4232584238052368, "kl": 0.02189805470407009, "learning_rate": 2.0102307617879367e-07, "loss": 0.05139451026916504, "memory(GiB)": 69.34, "reward": 0.6979022026062012, "reward_std": 0.2908898785710335, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.19790220409631729, "rewards/ReportKG_Jaccard/std": 0.06560153625905514, "step": 4380, "train_speed(iter/s)": 0.036674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.2, "completions/mean_length": 72.8, "completions/min_length": 56.2, "epoch": 0.8858585858585859, "grad_norm": 1.0848309993743896, "kl": 0.051332151889801024, "learning_rate": 1.9757081977678625e-07, "loss": 0.043299734592437744, "memory(GiB)": 69.34, "reward": 1.0009607553482056, "reward_std": 0.3532897293567657, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.2509607195854187, "rewards/ReportKG_Jaccard/std": 0.05895942412316799, "step": 4385, "train_speed(iter/s)": 0.036682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.4, "completions/mean_length": 68.625, "completions/min_length": 48.4, "epoch": 0.8868686868686869, "grad_norm": 1.246748924255371, "kl": 0.038853398710489276, "learning_rate": 1.9414692342675787e-07, "loss": 0.10702499151229858, "memory(GiB)": 69.34, "reward": 1.133196759223938, "reward_std": 0.2626126348972321, "rewards/MultiModalAccuracyORM_Any/mean": 0.85, "rewards/MultiModalAccuracyORM_Any/std": 0.19948650598526002, "rewards/ReportKG_Jaccard/mean": 0.28319678604602816, "rewards/ReportKG_Jaccard/std": 0.07360673695802689, "step": 4390, "train_speed(iter/s)": 0.036695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.2, "completions/mean_length": 72.55, "completions/min_length": 56.8, "epoch": 0.8878787878787879, "grad_norm": 1.430738925933838, "kl": 0.05380360186100006, "learning_rate": 1.907514410014923e-07, "loss": 0.01512928307056427, "memory(GiB)": 69.34, "reward": 0.6585203409194946, "reward_std": 0.34971232563257215, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3265853762626648, "rewards/ReportKG_Jaccard/mean": 0.1585203304886818, "rewards/ReportKG_Jaccard/std": 0.05744030475616455, "step": 4395, "train_speed(iter/s)": 0.036697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 117.6, "completions/mean_length": 81.925, "completions/min_length": 60.6, "epoch": 0.8888888888888888, "grad_norm": 1.5723803043365479, "kl": 0.026613380759954453, "learning_rate": 1.8738442592670011e-07, "loss": -0.046462804079055786, "memory(GiB)": 69.34, "reward": 0.6466650843620301, "reward_std": 0.4907595753669739, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.47382218241691587, "rewards/ReportKG_Jaccard/mean": 0.146665059030056, "rewards/ReportKG_Jaccard/std": 0.05596293658018112, "step": 4400, "train_speed(iter/s)": 0.036706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 120.0, "completions/mean_length": 91.95, "completions/min_length": 66.6, "epoch": 0.8898989898989899, "grad_norm": 0.9595813751220703, "kl": 0.024004483968019484, "learning_rate": 1.8404593118017542e-07, "loss": 0.004383274912834167, "memory(GiB)": 69.34, "reward": 0.8729459643363953, "reward_std": 0.30783375799655915, "rewards/MultiModalAccuracyORM_Any/mean": 0.65, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.22294597029685975, "rewards/ReportKG_Jaccard/std": 0.05508080758154392, "step": 4405, "train_speed(iter/s)": 0.036697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 100.6, "completions/mean_length": 80.075, "completions/min_length": 61.8, "epoch": 0.8909090909090909, "grad_norm": 1.2766231298446655, "kl": 0.02241165339946747, "learning_rate": 1.8073600929096311e-07, "loss": -0.008774621039628982, "memory(GiB)": 69.34, "reward": 0.6080082356929779, "reward_std": 0.3636226385831833, "rewards/MultiModalAccuracyORM_Any/mean": 0.4, "rewards/MultiModalAccuracyORM_Any/std": 0.31564186215400697, "rewards/ReportKG_Jaccard/mean": 0.20800824463367462, "rewards/ReportKG_Jaccard/std": 0.06852171421051026, "step": 4410, "train_speed(iter/s)": 0.036696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.8, "completions/mean_length": 66.15, "completions/min_length": 47.6, "epoch": 0.8919191919191919, "grad_norm": 1.264241099357605, "kl": 0.04432574808597565, "learning_rate": 1.7745471233853438e-07, "loss": 0.10652459859848022, "memory(GiB)": 69.34, "reward": 0.8928789883852005, "reward_std": 0.2543985083699226, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.17422050833702088, "rewards/ReportKG_Jaccard/mean": 0.19287902265787124, "rewards/ReportKG_Jaccard/std": 0.10065826997160912, "step": 4415, "train_speed(iter/s)": 0.036701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 99.0, "completions/mean_length": 70.075, "completions/min_length": 47.6, "epoch": 0.8929292929292929, "grad_norm": 0.9597105383872986, "kl": 0.016542466729879378, "learning_rate": 1.7420209195196445e-07, "loss": 0.05115799903869629, "memory(GiB)": 69.34, "reward": 0.7114554643630981, "reward_std": 0.3207260563969612, "rewards/MultiModalAccuracyORM_Any/mean": 0.525, "rewards/MultiModalAccuracyORM_Any/std": 0.25587469935417173, "rewards/ReportKG_Jaccard/mean": 0.18645548224449157, "rewards/ReportKG_Jaccard/std": 0.08353657871484757, "step": 4420, "train_speed(iter/s)": 0.036696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 107.8, "completions/mean_length": 83.4, "completions/min_length": 65.6, "epoch": 0.8939393939393939, "grad_norm": 1.2049407958984375, "kl": 0.02104573044925928, "learning_rate": 1.7097819930912128e-07, "loss": 0.028942856192588805, "memory(GiB)": 69.34, "reward": 0.9467063158750534, "reward_std": 0.26298933029174804, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.1960918426513672, "rewards/ReportKG_Jaccard/mean": 0.2717063158750534, "rewards/ReportKG_Jaccard/std": 0.07571593970060349, "step": 4425, "train_speed(iter/s)": 0.036698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.4, "completions/mean_length": 73.275, "completions/min_length": 51.6, "epoch": 0.8949494949494949, "grad_norm": 0.993206262588501, "kl": 0.01841811239719391, "learning_rate": 1.677830851358608e-07, "loss": -0.03373030722141266, "memory(GiB)": 69.34, "reward": 0.9364385008811951, "reward_std": 0.3807493031024933, "rewards/MultiModalAccuracyORM_Any/mean": 0.675, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.2614385113120079, "rewards/ReportKG_Jaccard/std": 0.07693158313632012, "step": 4430, "train_speed(iter/s)": 0.036698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 104.2, "completions/mean_length": 73.975, "completions/min_length": 48.6, "epoch": 0.895959595959596, "grad_norm": 1.2177072763442993, "kl": 0.026626171171665193, "learning_rate": 1.646167997052288e-07, "loss": 0.005354158580303192, "memory(GiB)": 69.34, "reward": 0.6844788074493409, "reward_std": 0.3961707428097725, "rewards/MultiModalAccuracyORM_Any/mean": 0.5, "rewards/MultiModalAccuracyORM_Any/std": 0.3593845307826996, "rewards/ReportKG_Jaccard/mean": 0.18447882533073426, "rewards/ReportKG_Jaccard/std": 0.06237626299262047, "step": 4435, "train_speed(iter/s)": 0.036704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.4, "completions/mean_length": 70.55, "completions/min_length": 56.2, "epoch": 0.896969696969697, "grad_norm": 1.4099088907241821, "kl": 0.03665359988808632, "learning_rate": 1.614793928366689e-07, "loss": 0.024795301258563995, "memory(GiB)": 69.34, "reward": 0.6727918386459351, "reward_std": 0.43249144554138186, "rewards/MultiModalAccuracyORM_Any/mean": 0.425, "rewards/MultiModalAccuracyORM_Any/std": 0.39729605317115785, "rewards/ReportKG_Jaccard/mean": 0.24779184758663178, "rewards/ReportKG_Jaccard/std": 0.06031664237380028, "step": 4440, "train_speed(iter/s)": 0.036719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.2, "completions/mean_length": 72.5, "completions/min_length": 53.0, "epoch": 0.897979797979798, "grad_norm": 1.0614506006240845, "kl": 0.03241662085056305, "learning_rate": 1.5837091389523938e-07, "loss": 0.0016322802752256394, "memory(GiB)": 69.34, "reward": 1.0396076679229735, "reward_std": 0.14887791126966476, "rewards/MultiModalAccuracyORM_Any/mean": 0.625, "rewards/MultiModalAccuracyORM_Any/std": 0.07071067690849304, "rewards/ReportKG_Jaccard/mean": 0.41460766792297366, "rewards/ReportKG_Jaccard/std": 0.1151545450091362, "step": 4445, "train_speed(iter/s)": 0.036714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 93.8, "completions/mean_length": 69.925, "completions/min_length": 51.8, "epoch": 0.898989898989899, "grad_norm": 1.181774377822876, "kl": 0.016542517580091953, "learning_rate": 1.552914117908375e-07, "loss": 0.07460694313049317, "memory(GiB)": 69.34, "reward": 0.8106270849704742, "reward_std": 0.29245339781045915, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.26680251955986023, "rewards/ReportKG_Jaccard/mean": 0.2606270670890808, "rewards/ReportKG_Jaccard/std": 0.06048469766974449, "step": 4450, "train_speed(iter/s)": 0.036722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.0, "completions/mean_length": 74.725, "completions/min_length": 55.6, "epoch": 0.9, "grad_norm": 1.0334241390228271, "kl": 0.027366739138960837, "learning_rate": 1.5224093497742651e-07, "loss": 0.028676679730415343, "memory(GiB)": 69.34, "reward": 0.8230299830436707, "reward_std": 0.36570112556219103, "rewards/MultiModalAccuracyORM_Any/mean": 0.55, "rewards/MultiModalAccuracyORM_Any/std": 0.299601674079895, "rewards/ReportKG_Jaccard/mean": 0.2730299770832062, "rewards/ReportKG_Jaccard/std": 0.08215078264474869, "step": 4455, "train_speed(iter/s)": 0.036728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.4, "completions/mean_length": 79.675, "completions/min_length": 61.8, "epoch": 0.901010101010101, "grad_norm": 1.2712750434875488, "kl": 0.03431402314454317, "learning_rate": 1.4921953145227772e-07, "loss": 0.06688117384910583, "memory(GiB)": 69.34, "reward": 1.021264898777008, "reward_std": 0.3293548747897148, "rewards/MultiModalAccuracyORM_Any/mean": 0.75, "rewards/MultiModalAccuracyORM_Any/std": 0.2777460336685181, "rewards/ReportKG_Jaccard/mean": 0.2712648928165436, "rewards/ReportKG_Jaccard/std": 0.07271129712462425, "step": 4460, "train_speed(iter/s)": 0.036734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 90.2, "completions/mean_length": 70.875, "completions/min_length": 55.6, "epoch": 0.902020202020202, "grad_norm": 1.1525942087173462, "kl": 0.0262862004339695, "learning_rate": 1.4622724875521296e-07, "loss": 0.04294489026069641, "memory(GiB)": 69.34, "reward": 0.8213777720928193, "reward_std": 0.4193357199430466, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.3552303433418274, "rewards/ReportKG_Jaccard/mean": 0.2213777631521225, "rewards/ReportKG_Jaccard/std": 0.07868191488087177, "step": 4465, "train_speed(iter/s)": 0.03674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.8, "completions/mean_length": 64.925, "completions/min_length": 50.2, "epoch": 0.9030303030303031, "grad_norm": 1.2242063283920288, "kl": 0.046152626350522044, "learning_rate": 1.4326413396785485e-07, "loss": -0.0027018189430236816, "memory(GiB)": 69.34, "reward": 0.8625436067581177, "reward_std": 0.26384113281965255, "rewards/MultiModalAccuracyORM_Any/mean": 0.575, "rewards/MultiModalAccuracyORM_Any/std": 0.21213203072547912, "rewards/ReportKG_Jaccard/mean": 0.28754361271858214, "rewards/ReportKG_Jaccard/std": 0.0869012750685215, "step": 4470, "train_speed(iter/s)": 0.036744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 85.6, "completions/mean_length": 70.425, "completions/min_length": 52.6, "epoch": 0.9040404040404041, "grad_norm": 1.376861572265625, "kl": 0.036660040728747845, "learning_rate": 1.403302337128902e-07, "loss": -0.005170682072639465, "memory(GiB)": 69.34, "reward": 1.1333346009254455, "reward_std": 0.20455770641565324, "rewards/MultiModalAccuracyORM_Any/mean": 0.95, "rewards/MultiModalAccuracyORM_Any/std": 0.1414213538169861, "rewards/ReportKG_Jaccard/mean": 0.18333462476730347, "rewards/ReportKG_Jaccard/std": 0.07627133429050445, "step": 4475, "train_speed(iter/s)": 0.036759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 83.6, "completions/mean_length": 63.575, "completions/min_length": 51.0, "epoch": 0.9050505050505051, "grad_norm": 1.0013772249221802, "kl": 0.029730130918323994, "learning_rate": 1.3742559415333267e-07, "loss": 0.09773817062377929, "memory(GiB)": 69.34, "reward": 1.134416389465332, "reward_std": 0.3935319870710373, "rewards/MultiModalAccuracyORM_Any/mean": 0.825, "rewards/MultiModalAccuracyORM_Any/std": 0.33751319646835326, "rewards/ReportKG_Jaccard/mean": 0.30941641330718994, "rewards/ReportKG_Jaccard/std": 0.0797230713069439, "step": 4480, "train_speed(iter/s)": 0.036757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.6, "completions/mean_length": 65.45, "completions/min_length": 51.6, "epoch": 0.906060606060606, "grad_norm": 1.4492390155792236, "kl": 0.03213022965937853, "learning_rate": 1.3455026099179833e-07, "loss": 0.04307892024517059, "memory(GiB)": 69.34, "reward": 0.9308076500892639, "reward_std": 0.28664499521255493, "rewards/MultiModalAccuracyORM_Any/mean": 0.7, "rewards/MultiModalAccuracyORM_Any/std": 0.23400336503982544, "rewards/ReportKG_Jaccard/mean": 0.23080764412879945, "rewards/ReportKG_Jaccard/std": 0.08218536972999572, "step": 4485, "train_speed(iter/s)": 0.036772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 91.0, "completions/mean_length": 68.775, "completions/min_length": 51.2, "epoch": 0.907070707070707, "grad_norm": 1.9855390787124634, "kl": 0.021896174922585488, "learning_rate": 1.3170427946978669e-07, "loss": 0.050758326053619386, "memory(GiB)": 69.34, "reward": 0.854155170917511, "reward_std": 0.3891630724072456, "rewards/MultiModalAccuracyORM_Any/mean": 0.6, "rewards/MultiModalAccuracyORM_Any/std": 0.34844101667404176, "rewards/ReportKG_Jaccard/mean": 0.25415516793727877, "rewards/ReportKG_Jaccard/std": 0.059644486755132675, "step": 4490, "train_speed(iter/s)": 0.036782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 92.0, "completions/mean_length": 73.9, "completions/min_length": 60.2, "epoch": 0.908080808080808, "grad_norm": 1.5982325077056885, "kl": 0.028883973509073256, "learning_rate": 1.2888769436696721e-07, "loss": 0.02946324348449707, "memory(GiB)": 69.34, "reward": 1.0018215894699096, "reward_std": 0.40191131383180617, "rewards/MultiModalAccuracyORM_Any/mean": 0.775, "rewards/MultiModalAccuracyORM_Any/std": 0.37031235098838805, "rewards/ReportKG_Jaccard/mean": 0.22682158648967743, "rewards/ReportKG_Jaccard/std": 0.0647004995495081, "step": 4495, "train_speed(iter/s)": 0.036783 }, { "epoch": 0.9090909090909091, "grad_norm": 1.2954033613204956, "learning_rate": 1.261005500004768e-07, "loss": 0.04231387078762054, "memory(GiB)": 69.34, "step": 4500, "train_speed(iter/s)": 0.036785 }, { "epoch": 0.9090909090909091, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 1.0, "eval_completions/max_length": 99.82, "eval_completions/mean_length": 76.085, "eval_completions/min_length": 56.52, "eval_kl": 0.028075211308896543, "eval_loss": 0.025887737050652504, "eval_reward": 0.7826179817318917, "eval_reward_std": 0.3160012882202864, "eval_rewards/MultiModalAccuracyORM_Any/mean": 0.57, "eval_rewards/MultiModalAccuracyORM_Any/std": 0.27666777551174165, "eval_rewards/ReportKG_Jaccard/mean": 0.21261798221617936, "eval_rewards/ReportKG_Jaccard/std": 0.0636282580718398, "eval_runtime": 897.1556, "eval_samples_per_second": 0.056, "eval_steps_per_second": 0.008, "step": 4500 } ], "logging_steps": 5, "max_steps": 4950, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }